xref: /xnu-10002.1.13/osfmk/arm/pmap/pmap.c (revision 1031c584a5e37aff177559b9f69dbd3c8c3fd30a)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Page-table operation callbacks for native address spaces.  The generic
 * pmap layer calls through this table for address-space ID management,
 * TLB maintenance, and WIMG-to-PTE attribute encoding.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                              /* allocate an ASID for a pmap */
	.free_id = free_asid,                                /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async, /* async per-VA-range TLBI */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,    /* async whole-ASID TLBI */
	.wimg_to_pte = wimg_to_pte,                          /* encode cacheability attrs into a PTE */
};
140 
/*
 * Per-level translation-table geometry for the 16KB translation granule.
 * Indices [0]..[3] correspond to ARM translation levels L0..L3.  Note the
 * leaf level (L3) uses PTE-format valid/type masks, while L0-L2 use the
 * TTE (table/block descriptor) masks.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not table descriptors. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level translation-table geometry for the 4KB translation granule.
 * Mirrors pmap_table_level_info_16k: indices [0]..[3] are levels L0..L3,
 * and only the leaf level (L3) uses PTE-format masks.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries are PTEs, not table descriptors. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/*
 * Page-table attributes for address spaces using the 4KB granule:
 * level geometry, commpage/root/max translation levels, AP (access
 * permission) and XN (execute-never) PTE encodings, and page sizing.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/*
	 * Derive the root level from the configured VA width: each 4K level
	 * resolves 9 bits of address.  With a 16-bit T0SZ baseline this picks
	 * the first level that can cover (64 - T0SZ_BOOT) bits.
	 */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),     /* read-only, EL0 and EL1 */
	.ap_rw = ARM_PTE_AP(AP_RWRW),     /* read-write, EL0 and EL1 */
	.ap_rona = ARM_PTE_AP(AP_RONA),   /* read-only EL1, no EL0 access */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),   /* read-write EL1, no EL0 access */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX, /* execute-never at both ELs */
	.ap_x = ARM_PTE_PNX,               /* EL0-executable, privileged-XN */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_page_shift = 12,
};
247 
/*
 * Page-table attributes for address spaces using the 16KB granule.
 * See pmap_pt_attr_4k for field-by-field commentary; the AP/XN encodings
 * are granule-independent.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),     /* read-only, EL0 and EL1 */
	.ap_rw = ARM_PTE_AP(AP_RWRW),     /* read-write, EL0 and EL1 */
	.ap_rona = ARM_PTE_AP(AP_RONA),   /* read-only EL1, no EL0 access */
	.ap_rwna = ARM_PTE_AP(AP_RWNA),   /* read-write EL1, no EL0 access */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX, /* execute-never at both ELs */
	.ap_x = ARM_PTE_PNX,               /* EL0-executable, privileged-XN */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_page_shift = 14,
};
266 
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272 
273 
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276 
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279 
280 #endif /* MACH_ASSERT */
281 
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 	(current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288 
289 
290 /*
291  * Represents a tlb range that will be flushed before exiting
292  * the ppl.
293  * Used by phys_attribute_clear_range to defer flushing pages in
294  * this range until the end of the operation.
295  */
296 typedef struct pmap_tlb_flush_range {
297 	pmap_t ptfr_pmap;
298 	vm_map_address_t ptfr_start;
299 	vm_map_address_t ptfr_end;
300 	bool ptfr_flush_needed;
301 } pmap_tlb_flush_range_t;
302 
303 #if XNU_MONITOR
304 /*
305  * PPL External References.
306  */
307 extern vm_offset_t   segPPLDATAB;
308 extern unsigned long segSizePPLDATA;
309 extern vm_offset_t   segPPLTEXTB;
310 extern unsigned long segSizePPLTEXT;
311 extern vm_offset_t   segPPLDATACONSTB;
312 extern unsigned long segSizePPLDATACONST;
313 
314 
315 /*
316  * PPL Global Variables
317  */
318 
319 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
320 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
321 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
322 #else
323 const boolean_t pmap_ppl_disable = FALSE;
324 #endif
325 
326 /*
327  * Indicates if the PPL has started applying APRR.
328  * This variable is accessed from various assembly trampolines, so be sure to change
329  * those if you change the size or layout of this variable.
330  */
331 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
332 
333 extern void *pmap_stacks_start;
334 extern void *pmap_stacks_end;
335 
336 #endif /* !XNU_MONITOR */
337 
338 
339 
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
342 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344 
345 extern uint8_t bootstrap_pagetables[];
346 
347 extern unsigned int not_in_kdp;
348 
349 extern vm_offset_t first_avail;
350 
351 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
352 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
353 extern vm_offset_t     static_memory_end;
354 
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357 
358 extern int maxproc, hard_maxproc;
359 
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362 
363 /*
364  * The bounds on our TTBRs.  These are for sanity checking that
365  * an address is accessible by a TTBR before we attempt to map it.
366  */
367 
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370 
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373 
374 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t    kernel_pmap = &kernel_pmap_store;
376 
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
378 
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
382 
383 typedef struct tt_free_entry {
384 	struct tt_free_entry    *next;
385 } tt_free_entry_t;
386 
387 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
388 
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX   4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
400 
401 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
402 
403 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409 
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412 
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
415 
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418 
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1;                                     /* enable no-execute protection */
421 int allow_data_exec  = 0;                               /* No apps may execute data */
422 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1;                                       /* enable no-execute protection */
428 const int allow_data_exec  = 0;                         /* No apps may execute data */
429 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431 
432 /**
433  * This variable is set true during hibernation entry to protect pmap data structures
434  * during image copying, and reset false on hibernation exit.
435  */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437 
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/*
 * Ledger consistency checking is compiled out on non-MACH_ASSERT kernels;
 * keep a no-op stub so callers need no conditional compilation.
 */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
446 
447 /**
448  * This helper function ensures that potentially-long-running batched PPL operations are
449  * called in preemptible context before entering the PPL, so that the PPL call may
450  * periodically exit to allow pending urgent ASTs to be taken.
451  */
static inline void
pmap_verify_preemptible(void)
{
	/* Early boot legitimately runs with preemption disabled; allow it. */
	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
}
457 
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459 
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
462 
463 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
464 
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470     "Minimum address space size outside allowable range");
471 
472 // Max offset is 15.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 11.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476 
477 
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479     "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481     "Small device address space size outside allowable range");
482 
483 #  ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 #  else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 #  endif
488 #endif /* __arm64__ */
489 
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495 
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
499 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
500 #if !HAS_16BIT_ASID
501 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
502 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
503 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
504 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
505 #else
506 static uint16_t last_allocated_asid = 0;
507 #endif /* !HAS_16BIT_ASID */
508 
509 
510 #if __ARM_MIXED_PAGE_SIZE__
511 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
512 #endif
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
514 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
515 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
516 
/* PTE Define Macros */

/*
 * True iff PTE value (x) is invalid but carries the VM layer's "compressed
 * page" marker.  Panics if any bits outside the compressed-marker mask are
 * also set, since that indicates a corrupted PTE; (p) is the PTE's address,
 * used only in the panic message.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True iff the mapping is wired (pinned in memory). */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/*
 * True iff the software "was writeable" bit is set (tracks prior write
 * permission; presumably consumed by the fast-fault path — see
 * arm_clear_fast_fault).
 */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit in a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
540 
541 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)542 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
543 {
544 	if (wired) {
545 		*ptep |= ARM_PTE_WIRED;
546 	} else {
547 		*ptep &= ~ARM_PTE_WIRED;
548 	}
549 	/*
550 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
551 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
552 	 * never reclaimed.
553 	 */
554 	if (pmap == kernel_pmap) {
555 		return;
556 	}
557 	unsigned short *ptd_wiredcnt_ptr;
558 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
559 	if (wired) {
560 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
561 	} else {
562 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 		if (__improbable(prev_wired == 0)) {
564 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
565 		}
566 	}
567 }
568 
569 #if HAS_FEAT_XS
570 
571 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)572 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
573 {
574 	if (__improbable(pt_attr->stage2)) {
575 		return false;
576 	}
577 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
578 	case CACHE_ATTRINDX_POSTED_XS:
579 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
580 		return true;
581 	default:
582 		return false;
583 	}
584 }
585 
586 #endif /* HAS_FEAT_XS */
587 
588 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
589 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
590 	arm64_sync_tlb(strong);                                                                               \
591 }
592 
593 /*
594  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
595  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
596  * will observe the updated PTE.
597  */
598 #define FLUSH_PTE()                                                                     \
599 	__builtin_arm_dmb(DMB_ISH);
600 
601 /*
602  * Synchronize updates to PTEs that were previously valid and thus may be cached in
603  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
604  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
605  * program order will not issue until the DSB completes.  Prior loads may be reordered
606  * after the barrier, but their behavior should not be materially affected by the
607  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
608  * matter for loads until the access is re-driven well after the TLB update is
609  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
610  * we should be in a position to handle access faults.  For "voluntary" PTE access
611  * restriction due to unmapping or protection, the decision to restrict access should
612  * have a data dependency on prior loads in order to avoid a data race.
613  */
614 #define FLUSH_PTE_STRONG()                                                             \
615 	__builtin_arm_dsb(DSB_ISHST);
616 
617 /**
618  * Write enough page table entries to map a single VM page. On systems where the
619  * VM page size does not match the hardware page size, multiple page table
620  * entries will need to be written.
621  *
622  * @note This function does not emit a barrier to ensure these page table writes
623  *       have completed before continuing. This is commonly needed. In the case
624  *       where a DMB or DSB barrier is needed, then use the write_pte() and
625  *       write_pte_strong() functions respectively instead of this one.
626  *
627  * @param ptep Pointer to the first page table entry to update.
628  * @param pte The value to write into each page table entry. In the case that
629  *            multiple PTEs are updated to a non-empty value, then the address
630  *            in this value will automatically be incremented for each PTE
631  *            write.
632  */
633 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)634 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
635 {
636 	/**
637 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
638 	 * systems, which is why it's checked at runtime instead of compile time.
639 	 * The "unreachable" warning needs to be suppressed because it still is a
640 	 * compile time constant on some systems.
641 	 */
642 	__unreachable_ok_push
643 	if (TEST_PAGE_RATIO_4) {
644 		if (((uintptr_t)ptep) & 0x1f) {
645 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
646 			    __func__, ptep, (void*)pte);
647 		}
648 
649 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
650 			/**
651 			 * If we're writing an empty/compressed PTE value, then don't
652 			 * auto-increment the address for each PTE write.
653 			 */
654 			*ptep = pte;
655 			*(ptep + 1) = pte;
656 			*(ptep + 2) = pte;
657 			*(ptep + 3) = pte;
658 		} else {
659 			*ptep = pte;
660 			*(ptep + 1) = pte | 0x1000;
661 			*(ptep + 2) = pte | 0x2000;
662 			*(ptep + 3) = pte | 0x3000;
663 		}
664 	} else {
665 		*ptep = pte;
666 	}
667 	__unreachable_ok_pop
668 }
669 
670 /**
671  * Writes enough page table entries to map a single VM page and then ensures
672  * those writes complete by executing a Data Memory Barrier.
673  *
674  * @note The DMB issued by this function is not strong enough to protect against
675  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
676  *       instruction is going to immediately be called after this write, it's
677  *       recommended to call write_pte_strong() instead of this function.
678  *
679  * See the function header for write_pte_fast() for more details on the
680  * parameters.
681  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: order the PTE store(s) before subsequent accesses */
}
688 
689 /**
690  * Writes enough page table entries to map a single VM page and then ensures
691  * those writes complete by executing a Data Synchronization Barrier. This
692  * barrier provides stronger guarantees than the DMB executed by write_pte().
693  *
694  * @note This function is useful if you're going to immediately flush the TLB
695  *       after making the PTE write. A DSB is required to protect against the
696  *       TLB invalidate being reordered before the PTE write.
697  *
698  * See the function header for write_pte_fast() for more details on the
699  * parameters.
700  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: complete the PTE store(s) before a following TLBI */
}
707 
708 /**
709  * Retrieve the pmap structure for the thread running on the current CPU.
710  */
711 pmap_t
current_pmap()712 current_pmap()
713 {
714 	const pmap_t current = vm_map_pmap(current_thread()->map);
715 
716 	assert(current != NULL);
717 
718 #if XNU_MONITOR
719 	/**
720 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
721 	 * decided by kernel-writable memory. This function is used in various parts
722 	 * of the PPL, and besides validating that the pointer returned by this
723 	 * function is indeed a pmap structure, it's also important to ensure that
724 	 * it's actually the current thread's pmap. This is because different pmaps
725 	 * will have access to different entitlements based on the code signature of
726 	 * their loaded process. So if a different user pmap is set in the current
727 	 * thread structure (in an effort to bypass code signing restrictions), even
728 	 * though the structure would validate correctly as it is a real pmap
729 	 * structure, it should fail here.
730 	 *
731 	 * This only needs to occur for user pmaps because the kernel pmap's root
732 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
733 	 * changed so it'd be redundant to check), and its code signing fields are
734 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
735 	 * it shouldn't be possible to set those fields. Due to that, an attacker
736 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
737 	 * this check won't accomplish anything as it doesn't provide any extra code
738 	 * signing entitlements.
739 	 */
740 	if ((current != kernel_pmap) &&
741 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
742 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
743 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
744 	}
745 #endif /* XNU_MONITOR */
746 
747 	return current;
748 }
749 
750 #if DEVELOPMENT || DEBUG
751 
752 /*
753  * Trace levels are controlled by a bitmask in which each
754  * level can be enabled/disabled by the (1<<level) position
755  * in the boot arg
756  * Level 0: PPL extension functionality
757  * Level 1: pmap lifecycle (create/destroy/switch)
758  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
759  * Level 3: internal state management (attributes/fast-fault)
760  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
761  */
762 
763 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
764 
765 #define PMAP_TRACE(level, ...) \
766 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
767 	        KDBG_RELEASE(__VA_ARGS__); \
768 	}
769 #else /* DEVELOPMENT || DEBUG */
770 
771 #define PMAP_TRACE(level, ...)
772 
773 #endif /* DEVELOPMENT || DEBUG */
774 
775 
776 /*
777  * Internal function prototypes (forward declarations).
778  */
779 
780 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
781 
782 static void pmap_set_reference(ppnum_t pn);
783 
784 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
785 
786 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
787 
788 static kern_return_t pmap_expand(
789 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
790 
791 static int pmap_remove_range(
792 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
793 
794 static tt_entry_t *pmap_tt1_allocate(
795 	pmap_t, vm_size_t, unsigned int);
796 
797 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
798 
799 static void pmap_tt1_deallocate(
800 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
801 
802 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
803 
804 static kern_return_t pmap_tt_allocate(
805 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
806 
807 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
808 
809 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
810 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
811 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
812 
813 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
814 
815 
816 static void pmap_unmap_commpage(
817 	pmap_t pmap);
818 
819 static boolean_t
820 pmap_is_64bit(pmap_t);
821 
822 
823 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
824 
825 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
826 
827 static bool pmap_update_cache_attributes_locked(
828 	ppnum_t, unsigned, bool);
829 
830 static boolean_t arm_clear_fast_fault(
831 	ppnum_t ppnum,
832 	vm_prot_t fault_type,
833 	pt_entry_t *pte_p);
834 
835 static void pmap_trim_self(pmap_t pmap);
836 static void pmap_trim_subord(pmap_t subord);
837 
838 
839 /*
840  * Temporary prototypes, while we wait for pmap_enter to move to taking an
841  * address instead of a page number.
842  */
843 static kern_return_t
844 pmap_enter_addr(
845 	pmap_t pmap,
846 	vm_map_address_t v,
847 	pmap_paddr_t pa,
848 	vm_prot_t prot,
849 	vm_prot_t fault_type,
850 	unsigned int flags,
851 	boolean_t wired);
852 
853 kern_return_t
854 pmap_enter_options_addr(
855 	pmap_t pmap,
856 	vm_map_address_t v,
857 	pmap_paddr_t pa,
858 	vm_prot_t prot,
859 	vm_prot_t fault_type,
860 	unsigned int flags,
861 	boolean_t wired,
862 	unsigned int options,
863 	__unused void   *arg);
864 
865 #ifdef CONFIG_XNUPOST
866 kern_return_t pmap_test(void);
867 #endif /* CONFIG_XNUPOST */
868 
869 PMAP_SUPPORT_PROTOTYPES(
870 	kern_return_t,
871 	arm_fast_fault, (pmap_t pmap,
872 	vm_map_address_t va,
873 	vm_prot_t fault_type,
874 	bool was_af_fault,
875 	bool from_user), ARM_FAST_FAULT_INDEX);
876 
877 PMAP_SUPPORT_PROTOTYPES(
878 	boolean_t,
879 	arm_force_fast_fault, (ppnum_t ppnum,
880 	vm_prot_t allow_mode,
881 	int options), ARM_FORCE_FAST_FAULT_INDEX);
882 
883 MARK_AS_PMAP_TEXT static boolean_t
884 arm_force_fast_fault_with_flush_range(
885 	ppnum_t ppnum,
886 	vm_prot_t allow_mode,
887 	int options,
888 	pmap_tlb_flush_range_t *flush_range);
889 
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 *
 * The whole struct packs into a single uint64_t (enforced by the
 * static_assert below) so it can be passed and returned by value; it is
 * the return type of pmap_batch_set_cache_attributes().
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	:0;                                 /* Zero-width bit-field: closes out the 64-bit unit */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4

/* Guarantee the state machine's cursor fits in a single 64-bit value. */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
909 
910 PMAP_SUPPORT_PROTOTYPES(
911 	batch_set_cache_attr_state_t,
912 	pmap_batch_set_cache_attributes, (
913 #if XNU_MONITOR
914 		volatile upl_page_info_t *user_page_list,
915 #else /* !XNU_MONITOR */
916 		upl_page_info_array_t user_page_list,
917 #endif /* XNU_MONITOR */
918 		batch_set_cache_attr_state_t state,
919 		unsigned int page_cnt,
920 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
921 
922 PMAP_SUPPORT_PROTOTYPES(
923 	kern_return_t,
924 	pmap_change_wiring, (pmap_t pmap,
925 	vm_map_address_t v,
926 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
927 
928 PMAP_SUPPORT_PROTOTYPES(
929 	pmap_t,
930 	pmap_create_options, (ledger_t ledger,
931 	vm_map_size_t size,
932 	unsigned int flags,
933 	kern_return_t * kr), PMAP_CREATE_INDEX);
934 
935 PMAP_SUPPORT_PROTOTYPES(
936 	void,
937 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
938 
939 PMAP_SUPPORT_PROTOTYPES(
940 	kern_return_t,
941 	pmap_enter_options, (pmap_t pmap,
942 	vm_map_address_t v,
943 	pmap_paddr_t pa,
944 	vm_prot_t prot,
945 	vm_prot_t fault_type,
946 	unsigned int flags,
947 	boolean_t wired,
948 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
949 
950 PMAP_SUPPORT_PROTOTYPES(
951 	pmap_paddr_t,
952 	pmap_find_pa, (pmap_t pmap,
953 	addr64_t va), PMAP_FIND_PA_INDEX);
954 
955 PMAP_SUPPORT_PROTOTYPES(
956 	kern_return_t,
957 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
958 
959 
960 PMAP_SUPPORT_PROTOTYPES(
961 	boolean_t,
962 	pmap_is_empty, (pmap_t pmap,
963 	vm_map_offset_t va_start,
964 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
965 
966 
967 PMAP_SUPPORT_PROTOTYPES(
968 	unsigned int,
969 	pmap_map_cpu_windows_copy, (ppnum_t pn,
970 	vm_prot_t prot,
971 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
972 
973 PMAP_SUPPORT_PROTOTYPES(
974 	void,
975 	pmap_ro_zone_memcpy, (zone_id_t zid,
976 	vm_offset_t va,
977 	vm_offset_t offset,
978 	const vm_offset_t new_data,
979 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
980 
981 PMAP_SUPPORT_PROTOTYPES(
982 	uint64_t,
983 	pmap_ro_zone_atomic_op, (zone_id_t zid,
984 	vm_offset_t va,
985 	vm_offset_t offset,
986 	zro_atomic_op_t op,
987 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
988 
989 PMAP_SUPPORT_PROTOTYPES(
990 	void,
991 	pmap_ro_zone_bzero, (zone_id_t zid,
992 	vm_offset_t va,
993 	vm_offset_t offset,
994 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
995 
996 PMAP_SUPPORT_PROTOTYPES(
997 	vm_map_offset_t,
998 	pmap_nest, (pmap_t grand,
999 	pmap_t subord,
1000 	addr64_t vstart,
1001 	uint64_t size,
1002 	vm_map_offset_t vrestart,
1003 	kern_return_t * krp), PMAP_NEST_INDEX);
1004 
1005 PMAP_SUPPORT_PROTOTYPES(
1006 	void,
1007 	pmap_page_protect_options, (ppnum_t ppnum,
1008 	vm_prot_t prot,
1009 	unsigned int options,
1010 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1011 
1012 PMAP_SUPPORT_PROTOTYPES(
1013 	vm_map_address_t,
1014 	pmap_protect_options, (pmap_t pmap,
1015 	vm_map_address_t start,
1016 	vm_map_address_t end,
1017 	vm_prot_t prot,
1018 	unsigned int options,
1019 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1020 
1021 PMAP_SUPPORT_PROTOTYPES(
1022 	kern_return_t,
1023 	pmap_query_page_info, (pmap_t pmap,
1024 	vm_map_offset_t va,
1025 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1026 
1027 PMAP_SUPPORT_PROTOTYPES(
1028 	mach_vm_size_t,
1029 	pmap_query_resident, (pmap_t pmap,
1030 	vm_map_address_t start,
1031 	vm_map_address_t end,
1032 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1033 
1034 PMAP_SUPPORT_PROTOTYPES(
1035 	void,
1036 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1037 
1038 PMAP_SUPPORT_PROTOTYPES(
1039 	vm_map_address_t,
1040 	pmap_remove_options, (pmap_t pmap,
1041 	vm_map_address_t start,
1042 	vm_map_address_t end,
1043 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1044 
1045 
1046 PMAP_SUPPORT_PROTOTYPES(
1047 	void,
1048 	pmap_set_cache_attributes, (ppnum_t pn,
1049 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1050 
1051 PMAP_SUPPORT_PROTOTYPES(
1052 	void,
1053 	pmap_update_compressor_page, (ppnum_t pn,
1054 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1055 
1056 PMAP_SUPPORT_PROTOTYPES(
1057 	void,
1058 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1059 
1060 #if MACH_ASSERT || XNU_MONITOR
1061 PMAP_SUPPORT_PROTOTYPES(
1062 	void,
1063 	pmap_set_process, (pmap_t pmap,
1064 	int pid,
1065 	char *procname), PMAP_SET_PROCESS_INDEX);
1066 #endif
1067 
1068 PMAP_SUPPORT_PROTOTYPES(
1069 	void,
1070 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1071 
1072 PMAP_SUPPORT_PROTOTYPES(
1073 	vm_map_offset_t,
1074 	pmap_unnest_options, (pmap_t grand,
1075 	addr64_t vaddr,
1076 	uint64_t size,
1077 	vm_map_offset_t vrestart,
1078 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1079 
1080 PMAP_SUPPORT_PROTOTYPES(
1081 	void,
1082 	phys_attribute_set, (ppnum_t pn,
1083 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1084 
1085 PMAP_SUPPORT_PROTOTYPES(
1086 	void,
1087 	phys_attribute_clear, (ppnum_t pn,
1088 	unsigned int bits,
1089 	int options,
1090 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1091 
1092 #if __ARM_RANGE_TLBI__
1093 PMAP_SUPPORT_PROTOTYPES(
1094 	vm_map_address_t,
1095 	phys_attribute_clear_range, (pmap_t pmap,
1096 	vm_map_address_t start,
1097 	vm_map_address_t end,
1098 	unsigned int bits,
1099 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1100 #endif /* __ARM_RANGE_TLBI__ */
1101 
1102 
1103 PMAP_SUPPORT_PROTOTYPES(
1104 	void,
1105 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1106 
1107 PMAP_SUPPORT_PROTOTYPES(
1108 	void,
1109 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1110 
1111 PMAP_SUPPORT_PROTOTYPES(
1112 	void,
1113 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1114 
1115 PMAP_SUPPORT_PROTOTYPES(
1116 	void,
1117 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1118 
1119 PMAP_SUPPORT_PROTOTYPES(
1120 	void,
1121 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1122 
1123 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1124 PMAP_SUPPORT_PROTOTYPES(
1125 	void,
1126 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1127 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1128 
1129 /* Definition of the states used by pmap_trim(). */
1130 typedef enum {
1131 	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
1132 	PMAP_TRIM_STATE_START = 0,
1133 
1134 	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
1135 	PMAP_TRIM_STATE_GRAND_BEFORE,
1136 
1137 	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
1138 	PMAP_TRIM_STATE_GRAND_AFTER,
1139 
1140 	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
1141 	PMAP_TRIM_STATE_SUBORD,
1142 
1143 	/* Marks that trimming is finished. */
1144 	PMAP_TRIM_STATE_DONE,
1145 
1146 	/* Sentry enum for sanity checks. */
1147 	PMAP_TRIM_STATE_COUNT,
1148 } pmap_trim_state_t;
1149 
1150 PMAP_SUPPORT_PROTOTYPES(
1151 	pmap_trim_state_t,
1152 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1153 
1154 #if HAS_APPLE_PAC
1155 PMAP_SUPPORT_PROTOTYPES(
1156 	void *,
1157 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1158 PMAP_SUPPORT_PROTOTYPES(
1159 	void *,
1160 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1161 #endif /* HAS_APPLE_PAC */
1162 
1163 
1164 
1165 
1166 PMAP_SUPPORT_PROTOTYPES(
1167 	kern_return_t,
1168 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1169 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1170 
1171 PMAP_SUPPORT_PROTOTYPES(
1172 	kern_return_t,
1173 	pmap_load_trust_cache_with_type, (TCType_t type,
1174 	const vm_address_t pmap_img4_payload,
1175 	const vm_size_t pmap_img4_payload_len,
1176 	const vm_address_t img4_manifest,
1177 	const vm_size_t img4_manifest_len,
1178 	const vm_address_t img4_aux_manifest,
1179 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1180 
1181 PMAP_SUPPORT_PROTOTYPES(
1182 	void,
1183 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1184 
1185 PMAP_SUPPORT_PROTOTYPES(
1186 	kern_return_t,
1187 	pmap_query_trust_cache, (TCQueryType_t query_type,
1188 	const uint8_t cdhash[kTCEntryHashSize],
1189 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1190 
1191 #if PMAP_CS_INCLUDE_CODE_SIGNING
1192 
1193 PMAP_SUPPORT_PROTOTYPES(
1194 	kern_return_t,
1195 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1196 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1197 
1198 PMAP_SUPPORT_PROTOTYPES(
1199 	kern_return_t,
1200 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1201 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1202 
1203 PMAP_SUPPORT_PROTOTYPES(
1204 	kern_return_t,
1205 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1206 	pmap_cs_profile_t * profile_obj),
1207 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1208 
1209 PMAP_SUPPORT_PROTOTYPES(
1210 	kern_return_t,
1211 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1212 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1213 
1214 PMAP_SUPPORT_PROTOTYPES(
1215 	kern_return_t,
1216 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1217 	const void *kernel_entitlements),
1218 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1219 
1220 PMAP_SUPPORT_PROTOTYPES(
1221 	kern_return_t,
1222 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1223 	const void **kernel_entitlements),
1224 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1225 
1226 PMAP_SUPPORT_PROTOTYPES(
1227 	kern_return_t,
1228 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1229 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1230 
1231 PMAP_SUPPORT_PROTOTYPES(
1232 	kern_return_t,
1233 	pmap_cs_allow_invalid, (pmap_t pmap),
1234 	PMAP_CS_ALLOW_INVALID_INDEX);
1235 
1236 PMAP_SUPPORT_PROTOTYPES(
1237 	void,
1238 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1239 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1240 
1241 PMAP_SUPPORT_PROTOTYPES(
1242 	bool,
1243 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1244 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1245 
1246 PMAP_SUPPORT_PROTOTYPES(
1247 	void,
1248 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1249 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1250 
1251 PMAP_SUPPORT_PROTOTYPES(
1252 	void,
1253 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1254 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1255 
1256 #endif
1257 
1258 PMAP_SUPPORT_PROTOTYPES(
1259 	uint32_t,
1260 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1261 
1262 PMAP_SUPPORT_PROTOTYPES(
1263 	bool,
1264 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1265 
1266 PMAP_SUPPORT_PROTOTYPES(
1267 	void,
1268 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1269 
1270 void pmap_footprint_suspend(vm_map_t    map,
1271     boolean_t   suspend);
1272 PMAP_SUPPORT_PROTOTYPES(
1273 	void,
1274 	pmap_footprint_suspend, (vm_map_t map,
1275 	boolean_t suspend),
1276 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1277 
1278 
1279 
1280 
1281 #if DEVELOPMENT || DEBUG
1282 PMAP_SUPPORT_PROTOTYPES(
1283 	kern_return_t,
1284 	pmap_test_text_corruption, (pmap_paddr_t),
1285 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1286 #endif /* DEVELOPMENT || DEBUG */
1287 
1288 /*
1289  * The low global vector page is mapped at a fixed alias.
1290  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1291  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1292  * to check both addresses anyway for backward compatibility. So for now
1293  * we leave H6 and H7 where they were.
1294  */
1295 #if (ARM_PGSHIFT == 14)
1296 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1297 #else
1298 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1299 #endif
1300 
1301 
/*
 * Running allocation counters for translation-table memory: root tables,
 * intermediate (TTE) table pages, and leaf (PTE) table pages respectively.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1305 
#if XNU_MONITOR

/*
 * With pointer authentication available, PPL handler entries are stored as
 * signed function pointers; otherwise the qualifier expands to nothing.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX constants; each entry is the corresponding
 * *_internal implementation. The table itself is const (read-only).
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1398 
1399 #if XNU_MONITOR
1400 /**
1401  * A convenience function for setting protections on a single physical
1402  * aperture or static region mapping without invalidating the TLB.
1403  *
1404  * @note This function does not perform any TLB invalidations. That must be done
1405  *       separately to be able to safely use the updated mapping.
1406  *
1407  * @note This function understands the difference between the VM page size and
1408  *       the kernel page size and will update multiple PTEs if the sizes differ.
1409  *       In other words, enough PTEs will always get updated to change the
1410  *       permissions on a PAGE_SIZE amount of memory.
1411  *
1412  * @note The PVH lock for the physical page represented by this mapping must
1413  *       already be locked.
1414  *
1415  * @note This function assumes the caller has already verified that the PTE
1416  *       pointer does indeed point to a physical aperture or static region page
1417  *       table. Please validate your inputs before passing it along to this
1418  *       function.
1419  *
1420  * @param ptep Pointer to the physical aperture or static region page table to
1421  *             update with a new XPRR index.
1422  * @param expected_perm The XPRR index that is expected to already exist at the
1423  *                      current mapping. If the current index doesn't match this
1424  *                      then the system will panic.
1425  * @param new_perm The new XPRR index to update the mapping with.
1426  */
1427 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1428 pmap_set_pte_xprr_perm(
1429 	pt_entry_t * const ptep,
1430 	unsigned int expected_perm,
1431 	unsigned int new_perm)
1432 {
1433 	assert(ptep != NULL);
1434 
1435 	pt_entry_t spte = *ptep;
1436 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1437 
1438 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1439 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1440 		    __func__, ptep, new_perm, expected_perm);
1441 	}
1442 
1443 	/**
1444 	 * The PTE involved should be valid, should not have the hint bit set, and
1445 	 * should have the expected XPRR index.
1446 	 */
1447 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1448 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1449 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1450 		    __func__, ptep, spte, new_perm, expected_perm);
1451 	}
1452 
1453 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1454 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1455 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1456 		    __func__, ptep, spte, new_perm, expected_perm);
1457 	}
1458 
1459 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1460 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1461 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1462 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1463 	}
1464 
1465 	pt_entry_t template = spte;
1466 	template &= ~ARM_PTE_XPRR_MASK;
1467 	template |= xprr_perm_to_pte(new_perm);
1468 
1469 	write_pte_strong(ptep, template);
1470 }
1471 
1472 /**
1473  * Update the protections on a single physical aperture mapping and invalidate
1474  * the TLB so the mapping can be used.
1475  *
1476  * @note The PVH lock for the physical page must already be locked.
1477  *
1478  * @param pai The physical address index of the page whose physical aperture
1479  *            mapping will be updated with new permissions.
1480  * @param expected_perm The XPRR index that is expected to already exist at the
1481  *                      current mapping. If the current index doesn't match this
1482  *                      then the system will panic.
1483  * @param new_perm The new XPRR index to update the mapping with.
1484  */
1485 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1486 pmap_set_xprr_perm(
1487 	unsigned int pai,
1488 	unsigned int expected_perm,
1489 	unsigned int new_perm)
1490 {
1491 	pvh_assert_locked(pai);
1492 
1493 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1494 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1495 
1496 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1497 
1498 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1499 	sync_tlb_flush();
1500 }
1501 
1502 /**
1503  * Update the protections on a range of physical aperture or static region
1504  * mappings and invalidate the TLB so the mappings can be used.
1505  *
1506  * @note Static region mappings can only be updated before machine_lockdown().
1507  *       Physical aperture mappings can be updated at any time.
1508  *
1509  * @param start The starting virtual address of the static region or physical
1510  *              aperture range whose permissions will be updated.
1511  * @param end The final (inclusive) virtual address of the static region or
1512  *            physical aperture range whose permissions will be updated.
1513  * @param expected_perm The XPRR index that is expected to already exist at the
1514  *                      current mappings. If the current indices don't match
1515  *                      this then the system will panic.
1516  * @param new_perm The new XPRR index to update the mappings with.
1517  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 * All validation happens before any PTE is modified.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie within the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * (Rounds "va" up to the next twig boundary, clamped to "end".)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* A block (non-table) entry here is unexpected and fatal. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 *
		 * The PVH lock for each page is taken only around its own update,
		 * as pmap_set_pte_xprr_perm() requires.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Invalidate TLB entries for the entire updated range in a single pass. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1613 
1614 #endif /* XNU_MONITOR */
1615 
1616 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1617 PMAP_ZINFO_PALLOC(
1618 	pmap_t pmap, int bytes)
1619 {
1620 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1621 }
1622 
1623 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1624 PMAP_ZINFO_PFREE(
1625 	pmap_t pmap,
1626 	int bytes)
1627 {
1628 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1629 }
1630 
1631 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1632 pmap_tt_ledger_credit(
1633 	pmap_t          pmap,
1634 	vm_size_t       size)
1635 {
1636 	if (pmap != kernel_pmap) {
1637 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1638 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1639 	}
1640 }
1641 
1642 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1643 pmap_tt_ledger_debit(
1644 	pmap_t          pmap,
1645 	vm_size_t       size)
1646 {
1647 	if (pmap != kernel_pmap) {
1648 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1649 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1650 	}
1651 }
1652 
/*
 * Record that the given ASID has just been used, for the pseudo-LRU (pLRU)
 * ASID replacement tracking.
 *
 * Clearing the ASID's bit in asid_plru_bitmap marks it recently-used.  When
 * an entire 64-ASID word becomes fully used, the word's generation counter
 * is advanced and its bitmap is reset so tracking starts over; allocation
 * (see alloc_asid()) prefers words with the lowest generation.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit bitmap word covers 64 consecutive ASID indices. */
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/*
			 * The final word keeps its top bit clear on reset; presumably
			 * that slot is reserved / never allocatable — TODO confirm
			 * against the ASID allocator's bounds.
			 */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1666 
/**
 * Allocate a virtual ASID for a pmap and derive its hardware ASID.
 *
 * On pLRU-capable targets the allocator first searches the least-recently-
 * generation 64-ASID chunk for a free virtual ASID; on a miss (or with pLRU
 * disabled) it falls back to a linear bitmap scan. The virtual ASID is then
 * split into a hardware ASID (vasid % asid_chunk_size) and a software ASID
 * "epoch" (vasid / asid_chunk_size) stored in pmap->sw_asid.
 *
 * @param pmap The pmap to receive hw_asid/sw_asid; hw_asid is stored biased
 *             by +1 (ASID 0 is reserved for the kernel) and doubled under
 *             __ARM_KERNEL_PROTECT__ (separate EL0/EL1 ASIDs).
 *
 * @return true on success, false if all virtual ASIDs are exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the 64-ASID chunk with the lowest (oldest) generation count. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/* Walk the virtual-ASID bitmap in strides of one hardware-ASID chunk,
		 * looking for a free vasid whose hardware ASID falls in the chosen
		 * least-recently-used chunk. */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every virtual ASID is in use. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into hardware ASID + software "epoch". */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1751 
/**
 * Release a pmap's ASID back to the allocator.
 *
 * Atomically clears pmap->hw_asid (a zero value means "no ASID", making the
 * call idempotent), reverses the encoding applied by alloc_asid() to recover
 * the virtual ASID, returns the hardware ASID to the pLRU availability
 * bitmap, and sets the virtual ASID free in the global bitmap.
 *
 * @param pmap The pmap whose ASID is being freed.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* xchg so a concurrent/second free observes 0 and bails out. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

	/* Undo the biasing applied by alloc_asid(). */
#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;
#endif
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-VASID case: alloc_asid() landed on hw_asid == MAX_HW_ASIDS
		 * and remapped it; invert that remapping here. */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again for the pLRU allocator. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1784 
1785 
1786 boolean_t
pmap_valid_address(pmap_paddr_t addr)1787 pmap_valid_address(
1788 	pmap_paddr_t addr)
1789 {
1790 	return pa_valid(addr);
1791 }
1792 
1793 
1794 
1795 
1796 
1797 
1798 /*
1799  *      Map memory at initialization.  The physical addresses being
1800  *      mapped are not managed and are never unmapped.
1801  *
1802  *      For now, VM is already on, we only need to map the
1803  *      specified memory.
1804  */
1805 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1806 pmap_map(
1807 	vm_map_address_t virt,
1808 	vm_offset_t start,
1809 	vm_offset_t end,
1810 	vm_prot_t prot,
1811 	unsigned int flags)
1812 {
1813 	kern_return_t   kr;
1814 	vm_size_t       ps;
1815 
1816 	ps = PAGE_SIZE;
1817 	while (start < end) {
1818 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1819 		    prot, VM_PROT_NONE, flags, FALSE);
1820 
1821 		if (kr != KERN_SUCCESS) {
1822 			panic("%s: failed pmap_enter, "
1823 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1824 			    __FUNCTION__,
1825 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1826 		}
1827 
1828 		virt += ps;
1829 		start += ps;
1830 	}
1831 	return virt;
1832 }
1833 
/**
 * Back-door mapping routine with selectable memory attributes.
 *
 * Maps [start, end) physical at [virt, ...) in the kernel pmap by writing
 * PTEs directly (bypassing pmap_enter and PV tracking), with the cache
 * attribute chosen by the PMAP_MAP_BD_* option. Mappings are global
 * (except under __ARM_KERNEL_PROTECT__), non-executable, and RW or RO
 * for the kernel depending on @prot.
 *
 * @param virt    Kernel virtual address at which to begin mapping. A PTE must
 *                already exist for every page in the range (else panic).
 * @param start   First physical address to map.
 * @param end     End (exclusive) of the physical range.
 * @param prot    VM_PROT_WRITE selects kernel-RW; otherwise kernel-RO.
 * @param options PMAP_MAP_BD_* selector for the memory attribute index.
 *
 * @return The virtual address one past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      tmplate;
	pt_entry_t     *ptep;
	vm_map_address_t vaddr;
	vm_offset_t     paddr;
	pt_entry_t      mem_attr;

	/* Translate the mapping option into a cache-attribute index. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* Build the PTE template for the first page; the physical-address bits
	 * are advanced page by page in the loop below. */
	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("%s: no PTE for vaddr=%p, "
			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
			    __FUNCTION__, (void*)vaddr,
			    (void*)virt, (void*)start, (void*)end, prot, options);
		}

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale TLB entries covering the remapped range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1898 
1899 #if XNU_MONITOR
1900 /**
1901  * Remove kernel writeablity from an IO PTE value if the page is owned by
1902  * guarded mode software.
1903  *
1904  * @param paddr The physical address of the page which has to be non-DRAM.
1905  * @param tmplate The PTE value to be evaluated.
1906  *
1907  * @return A new PTE value with permission bits modified.
1908  */
static inline
pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
{
	/* Only IO (non-DRAM) pages may be handled here. */
	assert(!pa_valid(paddr));

	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));

	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
		/* PPL to own the page by converting KERN_RW to PPL_RW. */
		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
		switch (xprr_perm) {
		case XPRR_KERN_RO_PERM:
			/* Kernel read-only is already safe; leave the PTE unchanged. */
			break;
		case XPRR_KERN_RW_PERM:
			/* Strip the existing permission index and substitute PPL-RW,
			 * demoting the kernel to read-only access on this page. */
			tmplate &= ~ARM_PTE_XPRR_MASK;
			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
			break;
		default:
			/* Any other permission on a PPL-owned IO page is a config error. */
			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
		}
	}

	return tmplate;
}
1934 #endif /* XNU_MONITOR */
1935 
1936 /*
1937  *      Back-door routine for mapping kernel VM at initialization.
1938  *      Useful for mapping memory outside the range
1939  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1940  *      Otherwise like pmap_map.
1941  */
vm_map_address_t
pmap_map_bd(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot)
{
	pt_entry_t      tmplate;
	pt_entry_t              *ptep;
	vm_map_address_t vaddr;
	vm_offset_t             paddr;

	/* not cacheable and not buffered */
	tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		/* A PTE must already exist for every page in the target VA range. */
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

#if XNU_MONITOR
		/* For IO pages, let the PPL adjust the permission bits if it owns
		 * the page (may demote kernel access to read-only). */
		if (!pa_valid(paddr)) {
			tmplate = pmap_construct_io_pte(paddr, tmplate);
		}
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		/* Advance the physical-address bits in the template to the next page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale TLB entries covering the remapped range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	/* Return the first virtual address past the new mappings. */
	return vaddr;
}
1991 
1992 /*
1993  *      Back-door routine for mapping kernel VM at initialization.
1994  *      Useful for mapping memory specific physical addresses in early
1995  *      boot (i.e., before kernel_map is initialized).
1996  *
1997  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1998  */
1999 
2000 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2001 pmap_map_high_window_bd(
2002 	vm_offset_t pa_start,
2003 	vm_size_t len,
2004 	vm_prot_t prot)
2005 {
2006 	pt_entry_t              *ptep, pte;
2007 	vm_map_address_t        va_start = VREGION1_START;
2008 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
2009 	vm_map_address_t        va_end;
2010 	vm_map_address_t        va;
2011 	vm_size_t               offset;
2012 
2013 	offset = pa_start & PAGE_MASK;
2014 	pa_start -= offset;
2015 	len += offset;
2016 
2017 	if (len > (va_max - va_start)) {
2018 		panic("%s: area too large, "
2019 		    "pa_start=%p, len=%p, prot=0x%x",
2020 		    __FUNCTION__,
2021 		    (void*)pa_start, (void*)len, prot);
2022 	}
2023 
2024 scan:
2025 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2026 		ptep = pmap_pte(kernel_pmap, va_start);
2027 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2028 		if (*ptep == ARM_PTE_TYPE_FAULT) {
2029 			break;
2030 		}
2031 	}
2032 	if (va_start > va_max) {
2033 		panic("%s: insufficient pages, "
2034 		    "pa_start=%p, len=%p, prot=0x%x",
2035 		    __FUNCTION__,
2036 		    (void*)pa_start, (void*)len, prot);
2037 	}
2038 
2039 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2040 		ptep = pmap_pte(kernel_pmap, va_end);
2041 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2042 		if (*ptep != ARM_PTE_TYPE_FAULT) {
2043 			va_start = va_end + PAGE_SIZE;
2044 			goto scan;
2045 		}
2046 	}
2047 
2048 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2049 		ptep = pmap_pte(kernel_pmap, va);
2050 		pte = pa_to_pte(pa_start)
2051 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2052 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2053 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2054 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2055 #if __ARM_KERNEL_PROTECT__
2056 		pte |= ARM_PTE_NG;
2057 #endif /* __ARM_KERNEL_PROTECT__ */
2058 		write_pte_strong(ptep, pte);
2059 	}
2060 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2061 #if KASAN
2062 	kasan_notify_address(va_start, len);
2063 #endif
2064 	return va_start;
2065 }
2066 
/**
 * Determine the number of virtual ASIDs to support, from the device tree.
 *
 * Reads the 32-bit "pmap-max-asids" property from /defaults; falls back to
 * MAX_ASIDS when the property is absent. Panics on a malformed, zero, or
 * out-of-range value.
 *
 * @return The validated maximum virtual ASID count.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
#if HAS_16BIT_ASID
	/* Virtual ASIDs map 1:1 onto hardware ASIDs on 16-bit-ASID targets. */
	if (max_asids > MAX_HW_ASIDS) {
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#else
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#endif /* HAS_16BIT_ASID */
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2108 
2109 #if __arm64__
2110 /*
2111  * pmap_get_arm64_prot
2112  *
2113  * return effective armv8 VMSA block protections including
2114  * table AP/PXN/XN overrides of a pmap entry
2115  *
2116  */
2117 
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk the translation tables from the root, accumulating hierarchical
	 * table-level permission overrides along the way. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Unmapped address: no effective protections. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the aggregated table-level AP/XN/PXN override bits. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2166 #endif /* __arm64__ */
2167 
2168 /*
2169  *	Bootstrap the system enough to run with virtual memory.
2170  *
2171  *	The early VM initialization code has already allocated
2172  *	the first CPU's translation table and made entries for
2173  *	all the one-to-one mappings to be found there.
2174  *
2175  *	We must set up the kernel pmap structures, the
2176  *	physical-to-virtual translation lookup tables for the
2177  *	physical memory to be managed (between avail_start and
2178  *	avail_end).
2179  *
2180  *	Map the kernel's code and data, and allocate the system page table.
2181  *	Page_size must already be set.
2182  *
2183  *	Parameters:
2184  *	first_avail	first available physical page -
2185  *			   after kernel page tables
2186  *	avail_start	PA of first managed physical page
2187  *	avail_end	PA of last managed physical page
2188  */
2189 
2190 void
pmap_bootstrap(vm_offset_t vstart)2191 pmap_bootstrap(
2192 	vm_offset_t vstart)
2193 {
2194 	vm_map_offset_t maxoffset;
2195 
2196 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2197 
2198 #if XNU_MONITOR
2199 
2200 #if DEVELOPMENT || DEBUG
2201 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2202 #endif
2203 
2204 #if CONFIG_CSR_FROM_DT
2205 	if (csr_unsafe_kernel_text) {
2206 		pmap_ppl_disable = true;
2207 	}
2208 #endif /* CONFIG_CSR_FROM_DT */
2209 
2210 #endif /* XNU_MONITOR */
2211 
2212 #if DEVELOPMENT || DEBUG
2213 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2214 		kprintf("Kernel traces for pmap operations enabled\n");
2215 	}
2216 #endif
2217 
2218 	/*
2219 	 *	Initialize the kernel pmap.
2220 	 */
2221 #if ARM_PARAMETERIZED_PMAP
2222 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2223 #endif /* ARM_PARAMETERIZED_PMAP */
2224 #if HAS_APPLE_PAC
2225 	kernel_pmap->disable_jop = 0;
2226 #endif /* HAS_APPLE_PAC */
2227 	kernel_pmap->tte = cpu_tte;
2228 	kernel_pmap->ttep = cpu_ttep;
2229 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2230 	kernel_pmap->max = UINTPTR_MAX;
2231 	os_atomic_init(&kernel_pmap->ref_count, 1);
2232 #if XNU_MONITOR
2233 	os_atomic_init(&kernel_pmap->nested_count, 0);
2234 #endif
2235 	kernel_pmap->nx_enabled = TRUE;
2236 #ifdef  __arm64__
2237 	kernel_pmap->is_64bit = TRUE;
2238 #else
2239 	kernel_pmap->is_64bit = FALSE;
2240 #endif
2241 #if CONFIG_ROSETTA
2242 	kernel_pmap->is_rosetta = FALSE;
2243 #endif
2244 
2245 #if ARM_PARAMETERIZED_PMAP
2246 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2247 #endif /* ARM_PARAMETERIZED_PMAP */
2248 
2249 	kernel_pmap->nested_region_addr = 0x0ULL;
2250 	kernel_pmap->nested_region_size = 0x0ULL;
2251 	kernel_pmap->nested_region_asid_bitmap = NULL;
2252 	kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2253 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2254 
2255 	kernel_pmap->hw_asid = 0;
2256 	kernel_pmap->sw_asid = 0;
2257 
2258 	pmap_lock_init(kernel_pmap);
2259 
2260 	pmap_max_asids = pmap_compute_max_asids();
2261 #if HAS_16BIT_ASID
2262 	asid_chunk_size = MAX_HW_ASIDS;
2263 #else
2264 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2265 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2266 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2267 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2268 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2269 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2270 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2271 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2272 #endif /* HAS_16BIT_ASIDS */
2273 
2274 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2275 
2276 	/**
2277 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2278 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2279 	 * space for these data structures.
2280 	 */
2281 	pmap_data_bootstrap();
2282 
2283 	/**
2284 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2285 	 */
2286 	uat_bootstrap();
2287 
2288 
2289 	/**
2290 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2291 	 */
2292 	sart_bootstrap();
2293 
2294 	/**
2295 	 * Don't make any assumptions about the alignment of avail_start before this
2296 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2297 	 */
2298 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2299 
2300 	const pmap_paddr_t pmap_struct_start = avail_start;
2301 
2302 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2303 	avail_start = round_page(avail_start + asid_table_size);
2304 
2305 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2306 
2307 	vm_first_phys = gPhysBase;
2308 	vm_last_phys = trunc_page(avail_end);
2309 
2310 	queue_init(&map_pmap_list);
2311 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2312 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2313 	free_page_size_tt_count = 0;
2314 	free_page_size_tt_max = 0;
2315 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2316 	free_two_page_size_tt_count = 0;
2317 	free_two_page_size_tt_max = 0;
2318 	free_tt_list = TT_FREE_ENTRY_NULL;
2319 	free_tt_count = 0;
2320 	free_tt_max = 0;
2321 
2322 	virtual_space_start = vstart;
2323 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2324 
2325 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2326 #if !HAS_16BIT_ASID
2327 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2328 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2329 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2330 #endif /* !HAS_16BIT_ASID */
2331 
2332 
2333 
2334 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2335 		maxoffset = trunc_page(maxoffset);
2336 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2337 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2338 			arm_pmap_max_offset_default = maxoffset;
2339 		}
2340 	}
2341 #if defined(__arm64__)
2342 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2343 		maxoffset = trunc_page(maxoffset);
2344 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2345 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2346 			arm64_pmap_max_offset_default = maxoffset;
2347 		}
2348 	}
2349 #endif
2350 
2351 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2352 
2353 
2354 #if PMAP_CS_PPL_MONITOR
2355 	/* Initialize the PPL trust cache read-write lock */
2356 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2357 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2358 #endif
2359 
2360 #if MACH_ASSERT
2361 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2362 	    &vm_footprint_suspend_allowed,
2363 	    sizeof(vm_footprint_suspend_allowed));
2364 #endif /* MACH_ASSERT */
2365 
2366 #if KASAN
2367 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2368 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2369 #endif /* KASAN */
2370 
2371 	/**
2372 	 * Ensure that avail_start is always left on a page boundary. The calling
2373 	 * code might not perform any alignment before allocating page tables so
2374 	 * this is important.
2375 	 */
2376 	avail_start = round_page(avail_start);
2377 }
2378 
2379 #if XNU_MONITOR
2380 
2381 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2382 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2383 {
2384 	pmap_paddr_t cur_pa;
2385 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2386 		assert(pa_valid(cur_pa));
2387 		ppattr_pa_set_monitor(cur_pa);
2388 	}
2389 }
2390 
2391 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2392 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2393     pmap_paddr_t end_pa,
2394     unsigned int expected_perm,
2395     unsigned int new_perm)
2396 {
2397 	vm_offset_t start_va = phystokv(start_pa);
2398 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2399 
2400 	pa_set_range_monitor(start_pa, end_pa);
2401 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2402 }
2403 
/**
 * Lock down the kernelcache against remapping.
 *
 * Walks the kernelcache's physical range and sets PVH_FLAG_LOCKDOWN_KC on
 * each page's PV head, skipping physical ranges whose physical-aperture
 * mapping is non-linear w.r.t. the kernelcache VA (memory to be reclaimed).
 * Panics if any page is already locked down. On CTRR+XNUPOST configs, the
 * CTRR test pages are exempted again at the end.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() trims range_size to the contiguous chunk it maps. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable for the POST tests to run. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2449 
/**
 * Transfer ownership/permissions of the static (bootstrap-time) allocations
 * to their final PPL (monitor) configuration.
 *
 * Marks the bootstrap page tables, bootstrap allocations, RO page tables,
 * PPL data/text segments, and PPL stack aperture with the appropriate xPRR
 * permissions and PPL ownership, then locks down the kernelcache against
 * remapping. Called once during boot, before arm_vm_prot_finalize().
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permission is unchanged (RO->RO); this call just marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2537 
/**
 * Mark the PPL as locked down and lock down the commpage mappings.
 *
 * Locks down the read-only commpage data page and, if present, the commpage
 * text page (kept RX) so they can no longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* Commpage text must remain executable from EL0; lock it down RX. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): the unconditional #error below looks like residue of a
	 * stripped conditional block (redacted XPRR configuration variants) —
	 * as written this function cannot compile; confirm against the
	 * unredacted source. */
#error "XPRR configuration error"
}
2554 #endif /* XNU_MONITOR */
2555 
2556 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2557 pmap_virtual_space(
2558 	vm_offset_t *startp,
2559 	vm_offset_t *endp
2560 	)
2561 {
2562 	*startp = virtual_space_start;
2563 	*endp = virtual_space_end;
2564 }
2565 
2566 
/*
 * Enumerate the kernel virtual regions the VM may allocate from, one region
 * per call.  The set of valid region indices depends on the build
 * configuration (KTRR/CTRR, ARM_LARGE_MEMORY, page size).
 *
 * @param region_select index of the region being queried.
 * @param startp        out: base virtual address of the region.
 * @param size          out: size of the region in bytes.
 * @return TRUE if region_select names a valid region in this configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Region 2 exists only if the low globals do not already fall inside region 0's rounded range. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2660 
2661 /*
2662  * Routines to track and allocate physical pages during early boot.
2663  * On most systems that memory runs from first_avail through to avail_end
2664  * with no gaps.
2665  *
2666  * If the system supports ECC and ecc_bad_pages_count > 0, we
2667  * need to skip those pages.
2668  */
2669 
/* Pages remaining in [first_avail, avail_end); computed lazily by initialize_ram_ranges(). */
static unsigned int avail_page_count = 0;
/* Cleared by initialize_ram_ranges() on first use; boot is single threaded at that point. */
static bool need_ram_ranges_init = true;
2672 
2673 
2674 /**
2675  * Checks to see if a given page is in
2676  * the array of known bad pages
2677  *
2678  * @param ppn page number to check
2679  */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/* No ECC bad-page list is maintained in this configuration; no page is ever bad. */
	return false;
}
2685 
2686 /**
2687  * Prepare bad ram pages to be skipped.
2688  */
2689 
2690 /*
2691  * Initialize the count of available pages. No lock needed here,
2692  * as this code is called while kernel boot up is single threaded.
2693  */
2694 static void
initialize_ram_ranges(void)2695 initialize_ram_ranges(void)
2696 {
2697 	pmap_paddr_t first = first_avail;
2698 	pmap_paddr_t end = avail_end;
2699 
2700 	assert(first <= end);
2701 	assert(first == (first & ~PAGE_MASK));
2702 	assert(end == (end & ~PAGE_MASK));
2703 	avail_page_count = atop(end - first);
2704 
2705 	need_ram_ranges_init = false;
2706 }
2707 
2708 unsigned int
pmap_free_pages(void)2709 pmap_free_pages(
2710 	void)
2711 {
2712 	if (need_ram_ranges_init) {
2713 		initialize_ram_ranges();
2714 	}
2715 	return avail_page_count;
2716 }
2717 
2718 unsigned int
pmap_free_pages_span(void)2719 pmap_free_pages_span(
2720 	void)
2721 {
2722 	if (need_ram_ranges_init) {
2723 		initialize_ram_ranges();
2724 	}
2725 	return (unsigned int)atop(avail_end - first_avail);
2726 }
2727 
2728 
2729 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2730 pmap_next_page_hi(
2731 	ppnum_t            * pnum,
2732 	__unused boolean_t might_free)
2733 {
2734 	return pmap_next_page(pnum);
2735 }
2736 
2737 
2738 boolean_t
pmap_next_page(ppnum_t * pnum)2739 pmap_next_page(
2740 	ppnum_t *pnum)
2741 {
2742 	if (need_ram_ranges_init) {
2743 		initialize_ram_ranges();
2744 	}
2745 
2746 
2747 	if (first_avail != avail_end) {
2748 		*pnum = (ppnum_t)atop(first_avail);
2749 		first_avail += PAGE_SIZE;
2750 		assert(avail_page_count > 0);
2751 		--avail_page_count;
2752 		return TRUE;
2753 	}
2754 	assert(avail_page_count == 0);
2755 	return FALSE;
2756 }
2757 
2758 
2759 /*
2760  *	Initialize the pmap module.
2761  *	Called by vm_init, to initialize any structures that the pmap
2762  *	system needs to map virtual memory.
2763  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(This can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init.)
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids, since
	 * every process needs its own ASID.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2804 
2805 /**
2806  * Verify that a given physical page contains no mappings (outside of the
2807  * default physical aperture mapping).
2808  *
2809  * @param ppnum Physical page number to check there are no mappings to.
2810  *
2811  * @return True if there are no mappings, false otherwise or if the page is not
2812  *         kernel-managed.
2813  */
2814 bool
pmap_verify_free(ppnum_t ppnum)2815 pmap_verify_free(ppnum_t ppnum)
2816 {
2817 	const pmap_paddr_t pa = ptoa(ppnum);
2818 
2819 	assert(pa != vm_page_fictitious_addr);
2820 
2821 	/* Only mappings to kernel-managed physical memory are tracked. */
2822 	if (!pa_valid(pa)) {
2823 		return false;
2824 	}
2825 
2826 	const unsigned int pai = pa_index(pa);
2827 	pv_entry_t **pvh = pai_to_pvh(pai);
2828 
2829 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2830 }
2831 
#if MACH_ASSERT
/**
 * Verify that a given physical page contains no mappings (outside of the
 * default physical aperture mapping) and if it does, then panic.
 *
 * @note It's recommended to use pmap_verify_free() directly when operating in
 *       the PPL since the PVH lock isn't getting grabbed here (due to this code
 *       normally being called from outside of the PPL, and the pv_head_table
 *       can't be modified outside of the PPL).
 *
 * @param ppnum Physical page number to check there are no mappings to.
 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists: gather diagnostics and panic below. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
#endif /* MACH_ASSERT */
2917 
2918 
/*
 * Return the size, in bytes, of the root translation table for the given
 * pmap's page-table attribute configuration.
 */
static vm_size_t
pmap_root_alloc_size(pmap_t pmap)
{
#pragma unused(pmap)
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	unsigned int root_level = pt_attr_root_level(pt_attr);
	/* Entry count at the root level is (index mask >> shift) + 1. */
	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
}
2927 
2928 
2929 /*
2930  *	Create and return a physical map.
2931  *
2932  *	If the size specified for the map
2933  *	is zero, the map is an actual physical
2934  *	map, and may be referenced by the
2935  *	hardware.
2936  *
2937  *	If the size specified is non-zero,
2938  *	the map will be used in software only, and
2939  *	is bounded by that size.
2940  */
/*
 * Allocate and initialize a new user pmap.
 *
 * @param ledger task ledger to charge; the caller (pmap_create_options) holds
 *               a reference on it for the lifetime of the pmap.
 * @param size   must be 0; a non-zero size is rejected here (only meaningful
 *               for stage-2 configurations).
 * @param flags  PMAP_CREATE_* option bits; unknown bits are rejected.
 * @param kr     out: failure reason when PMAP_NULL is returned.
 *
 * @return the new pmap with ref_count initialized to 1, or PMAP_NULL.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve a hardware ASID for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_asid_bitmap = NULL;
	p->nested_region_asid_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the pmap on the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3131 
/*
 * Public entry point for creating a pmap.  Takes a ledger reference on
 * behalf of the new pmap and retries PPL allocation on page shortage.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	/* Reference held for the pmap; dropped on failure here or in pmap_destroy(). */
	ledger_reference(ledger);

#if XNU_MONITOR
	/* Retry until the PPL has enough pages to satisfy the allocation. */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	/*
	 * NOTE(review): this trace dereferences pmap (hw_asid) even when creation
	 * failed and pmap is PMAP_NULL — confirm PMAP_TRACE's argument evaluation
	 * is safe (or unreachable) in that case.
	 */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3167 
3168 #if XNU_MONITOR
3169 /*
3170  * This symbol remains in place when the PPL is enabled so that the dispatch
3171  * table does not change from development to release configurations.
3172  */
3173 #endif
3174 #if MACH_ASSERT || XNU_MONITOR
3175 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3176 pmap_set_process_internal(
3177 	__unused pmap_t pmap,
3178 	__unused int pid,
3179 	__unused char *procname)
3180 {
3181 #if MACH_ASSERT
3182 	if (pmap == NULL || pmap->pmap_pid == -1) {
3183 		return;
3184 	}
3185 
3186 	validate_pmap_mutable(pmap);
3187 
3188 	pmap->pmap_pid = pid;
3189 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3190 #endif /* MACH_ASSERT */
3191 }
3192 #endif /* MACH_ASSERT || XNU_MONITOR */
3193 
3194 #if MACH_ASSERT
/*
 * Record the owning process' pid and name on the pmap (debug builds only).
 * Dispatches into the PPL when the monitor is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3207 #endif /* MACH_ASSERT */
3208 
3209 /*
3210  * pmap_deallocate_all_leaf_tts:
3211  *
3212  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3213  * removing and deallocating all TTEs.
3214  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf tables are freed by their twig-level parent, so never recurse to leaf. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* ~0 yields the maximum index at this level, i.e. the last entry of the table. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* User pmaps are expected to contain only table entries at non-leaf levels. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching unlock here — presumably pmap_tte_deallocate
		 * drops the lock before returning; confirm its contract. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3253 
3254 /*
3255  * We maintain stats and ledgers so that a task's physical footprint is:
3256  * phys_footprint = ((internal - alternate_accounting)
3257  *                   + (internal_compressed - alternate_accounting_compressed)
3258  *                   + iokit_mapped
3259  *                   + purgeable_nonvolatile
3260  *                   + purgeable_nonvolatile_compressed
3261  *                   + page_table)
3262  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3263  */
3264 
3265 /*
3266  *	Retire the given physical map from service.
3267  *	Should only be called if the map contains
3268  *	no valid mappings.
3269  */
/*
 * Drop a reference on the pmap and, when the last reference goes away,
 * tear it down: remove it from the global list, free its translation
 * tables, flush its TLB entries, release its ASID, and free the struct.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		/* Other references remain; nothing more to do. */
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unlink from the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps have no ASID of their own; flush by region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_asid_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_asid_bitmap,
		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3382 
/*
 * Public entry point for dropping a reference on a pmap; also drops the
 * ledger reference taken in pmap_create_options().
 */
void
pmap_destroy(
	pmap_t pmap)
{
	/* NOTE(review): pmap is dereferenced below without a NULL check —
	 * presumably callers never pass PMAP_NULL here; confirm. */
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger before the pmap struct may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3403 
3404 
3405 /*
3406  *	Add a reference to the specified pmap.
3407  */
3408 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3409 pmap_reference_internal(
3410 	pmap_t pmap)
3411 {
3412 	if (pmap != PMAP_NULL) {
3413 		validate_pmap_mutable(pmap);
3414 		os_atomic_inc(&pmap->ref_count, relaxed);
3415 	}
3416 }
3417 
/*
 * Public entry point for taking a reference on a pmap; dispatches into
 * the PPL when the monitor is enabled.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3428 
/*
 * Allocate a root (TT1) translation table of the given size, preferring the
 * per-size free lists and falling back to a fresh zeroed page allocation.
 * Sub-page allocations carve one page into PAGE_SIZE/size chunks and stash
 * the extras on the sub-page free list.
 *
 * @param pmap   pmap to charge for the allocation.
 * @param size   requested table size; sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block.
 * @return KVA of the table, or NULL on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Try the free list matching the requested size first. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists empty: allocate at least a full zeroed page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/* Chain the unused chunks of the page onto the sub-page free list. */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3505 
/**
 * Return a root (TT1) translation table to the appropriate free list, and
 * opportunistically trim the page-sized free lists back down to their caps.
 *
 * @param pmap The pmap the table belonged to (used for ledger accounting).
 * @param tt KVA of the table being freed.
 * @param size Size of the allocation being freed; sub-page sizes other than
 *        PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the (potentially blocking)
 *        free-list trimming below.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Sub-page root tables go on the shared sub-page free list. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	/* Caller can't block: skip the trimming passes below and bail out now. */
	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the single-page free list down to its cap.  The lock is dropped
	 * around pmap_pages_free() (which may block), so the list heads are
	 * re-read under the lock on every iteration.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Same trimming pass for the two-page free list. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3586 
/**
 * Allocate a non-root page table for the given pmap, either by popping an
 * entry off the pmap's per-pmap free list or by allocating a fresh VM page
 * (plus a page table descriptor for it).
 *
 * @param pmap The pmap the new table will belong to.
 * @param ttp Output parameter: receives the KVA of the new table, or NULL
 *        is left in place on failure.
 * @param level Page table level being allocated; used only for statistics
 *        (twig vs. leaf counters).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid
 *        blocking on memory shortage.
 *
 * @return KERN_SUCCESS on success, KERN_ABORTED if the preemptible lock
 *         acquisition was interrupted, or KERN_RESOURCE_SHORTAGE when a
 *         no-wait allocation could not be satisfied.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			/*
			 * NOTE(review): the allocation above keys off PMAP_TT_ALLOCATE_NOWAIT
			 * while this bail-out keys off PMAP_OPTIONS_NOWAIT -- confirm the two
			 * flags are intended to be distinct here.
			 */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Twig-and-above tables and leaf tables are counted separately. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		/*
		 * When the pmap's page size is smaller than the kernel page size
		 * (e.g. 4K tables on a 16K kernel), the VM page holds several
		 * tables; chain the unused remainder onto the pmap's free list.
		 */
		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3685 
3686 
/**
 * Release a page table back to the pmap's free list, and free the backing
 * VM page entirely once every table on that page is unused.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being released; its refcount must already be
 *        zero (leaf tables) or the PT_DESC_REFCOUNT sentinel (twig tables).
 * @param level Level of the table being released; used for statistics.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized tables that fit on one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Poison the descriptor's VA slot for this table. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; reset it before the check. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of all tables sharing this kernel page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Count how many of this page's sibling tables already sit on the
		 * pmap's free list (plus the one being freed now).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * The entire kernel page is now free: unlink every entry that
			 * lives on this page from the free list so the page itself can
			 * be returned to the VM below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Siblings still in use: just push this table onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Drop the page's descriptor and return the page to the VM. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3775 
3776 /**
3777  * Safely clear out a translation table entry.
3778  *
3779  * @note If the TTE to clear out points to a leaf table, then that leaf table
3780  *       must have a refcnt of zero before the TTE can be removed.
3781  * @note This function expects to be called with pmap locked exclusive, and will
3782  *       return with pmap unlocked.
3783  *
3784  * @param pmap The pmap containing the page table whose TTE is being removed.
3785  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3786  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3787  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3788  * @param ttep Pointer to the TTE that should be cleared out.
3789  * @param level The level of the page table that contains the TTE to be removed.
3790  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot the TTE before clearing; used below to locate the child table. */
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	/* Per the contract, the pmap lock is dropped here and not reacquired. */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			/*
			 * Each pass clears bitmap bits for CPUs observed idle; WFE parks
			 * this CPU until the exclusive monitor is cleared by a store to
			 * the watched location (or another event), then we re-check.
			 */
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3965 
3966 /**
3967  * Given a pointer to an entry within a `level` page table, delete the
3968  * page table at `level` + 1 that is represented by that entry. For instance,
3969  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3970  * contains the PA of the L3 table, and `level` would be "2".
3971  *
3972  * @note If the table getting deallocated is a leaf table, then that leaf table
3973  *       must have a refcnt of zero before getting deallocated. All other levels
3974  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3975  * @note This function expects to be called with pmap locked exclusive and will
3976  *       return with pmap unlocked.
3977  *
3978  * @param pmap The pmap that owns the page table to be deallocated.
3979  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3980  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3981  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3982  * @param ttep Pointer to the `level` TTE to remove.
3983  * @param level The level of the table that contains an entry pointing to the
3984  *              table to be removed. The deallocated page table will be a
3985  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3986  *              deleted).
3987  */
3988 void
3989 pmap_tte_deallocate(
3990 	pmap_t pmap,
3991 	vm_offset_t va_start,
3992 	vm_offset_t va_end,
3993 	bool need_strong_sync,
3994 	tt_entry_t *ttep,
3995 	unsigned int level)
3996 {
3997 	tt_entry_t tte;
3998 
3999 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4000 
4001 	tte = *ttep;
4002 
4003 	if (tte_get_ptd(tte)->pmap != pmap) {
4004 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4005 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4006 	}
4007 
4008 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
4009 	    __func__, ttep, (unsigned long long)tte);
4010 
4011 	/* pmap_tte_remove() will drop the pmap lock */
4012 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4013 
4014 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4015 }
4016 
4017 /*
4018  *	Remove a range of hardware page-table entries.
4019  *	The entries given are the first (inclusive)
4020  *	and last (exclusive) entries for the VM pages.
4021  *	The virtual address is the va for the first pte.
4022  *
4023  *	The pmap must be locked.
4024  *	If the pmap is not the kernel pmap, the range must lie
4025  *	entirely within one pte-page.  This is NOT checked.
4026  *	Assumes that the pte-page exists.
4027  *
4028  *	Returns the number of PTE changed
4029  */
4030 MARK_AS_PMAP_TEXT static int
4031 pmap_remove_range(
4032 	pmap_t pmap,
4033 	vm_map_address_t va,
4034 	pt_entry_t *bpte,
4035 	pt_entry_t *epte)
4036 {
4037 	bool need_strong_sync = false;
4038 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4039 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4040 	if (num_changed > 0) {
4041 		PMAP_UPDATE_TLBS(pmap, va,
4042 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4043 	}
4044 	return num_changed;
4045 }
4046 
4047 
4048 #ifdef PVH_FLAG_EXEC
4049 
4050 /*
4051  *	Update the access protection bits of the physical aperture mapping for a page.
4052  *	This is useful, for example, in guranteeing that a verified executable page
4053  *	has no writable mappings anywhere in the system, including the physical
4054  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4055  *	synchronization overhead in cases where the call to this function is
4056  *	guaranteed to be followed by other TLB operations.
4057  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	/* Caller must hold the PVH lock for this physical page index. */
	pvh_assert_locked(pai);
	/* Locate the physical-aperture PTE for this page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the AP bits already have the requested value. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* Contiguous-hint mappings cannot be retyped one page at a time. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	/* Defer the DSB-based completion when the caller will flush anyway. */
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4082 #endif /* defined(PVH_FLAG_EXEC) */
4083 
4084 
4085 
/**
 * Remove the hardware mappings for the PTE range [bpte, epte), which must lie
 * within a single page table, unhooking each mapping from its PV list and
 * updating ledgers/refcounts.  TLB maintenance is left to the caller.
 *
 * @param pmap The pmap owning the mappings; must be locked exclusive.
 * @param va VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE to remove.
 * @param eva When non-NULL, enables preemption checks: on early exit, receives
 *        the VA at which processing stopped.
 * @param need_strong_sync Set to true if any removed mapping requires a
 *        strong DSB for TLB synchronization (HAS_FEAT_XS only).
 * @param options PMAP_OPTIONS_REMOVE enables handling of "compressed" markers.
 *
 * @return The number of PTEs actually modified (i.e. how many entries the
 *         caller must cover with TLB invalidation).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	/* Accumulated (negative) adjustment to the page table's refcount. */
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The range must not span multiple page tables (refcnt below is per-table). */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we have a stable (PVH-locked) view of the PTE, or
		 * determine the mapping is unmanaged (no PV accounting needed).
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock and confirm it still maps the same page. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* Each valid user mapping removed drops the table's refcount by one. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta; it must not drive the count negative. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4327 
4328 
4329 /*
4330  *	Remove the given range of addresses
4331  *	from the specified map.
4332  *
4333  *	It is assumed that the start and end are properly
4334  *	rounded to the hardware page size.
4335  */
4336 void
4337 pmap_remove(
4338 	pmap_t pmap,
4339 	vm_map_address_t start,
4340 	vm_map_address_t end)
4341 {
4342 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4343 }
4344 
/**
 * Remove mappings in [start, end) from the pmap.  The range must be contained
 * within a single twig-level (leaf table) region.  If the removal empties the
 * leaf table, the table itself is deallocated as well.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start Start VA of the range (inclusive).
 * @param end End VA of the range (exclusive); must be >= start.
 * @param options Removal options passed down to pmap_remove_range_options().
 *
 * @return The VA up to which removal actually proceeded; may be less than
 *         `end` if the operation yielded to pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* Nothing mapped at this twig entry: nothing to remove. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		/* May stop early at `eva` if preemption is pending. */
		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty, tear it down too. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4411 
/*
 *	Remove the given range of addresses from the specified map,
 *	honoring the supplied PMAP_OPTIONS_* flags.
 *
 *	The range is processed one twig (leaf-table) region at a time so
 *	each non-preemptible call into the internal/PPL helper is bounded.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

#if MACH_ASSERT
	/* Start and end must be page-aligned and within the pmap's VA bounds. */
	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
#endif

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the next twig-region boundary (or 'end'). */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		va = pmap_remove_options_ppl(pmap, va, l, options);

		/* Verify per-task ledgers stayed balanced across the PPL call. */
		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4475 
4476 
4477 /*
4478  *	Remove phys addr if mapped in specified map
4479  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
	/* Intentionally a no-op on ARM today; see note above. */
}
4487 
4488 /*
4489  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4490  * switch a thread onto a new vm_map.
4491  */
void
pmap_switch_user(thread_t thread, vm_map_t new_map)
{
	pmap_t new_pmap = new_map->pmap;


	/* Point the thread at the new map before activating its pmap on this CPU. */
	thread->map = new_map;
	pmap_set_pmap(new_pmap, thread);

}
4502 
/*
 * Activate 'pmap' on the current CPU.  On __ARM_USER_PROTECT__
 * configurations, also record the new TTBR value and ASID in the
 * thread's machine state (presumably consumed on context switch —
 * the consumer is not visible in this file).
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4517 
/*
 * Issue (without waiting for completion) a local-core TLB invalidation
 * of all entries tagged with this pmap's hardware ASID.
 */
static void
pmap_flush_core_tlb_asid_async(pmap_t pmap)
{
	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
}
4523 
/*
 * Returns true if the user translation-table base register currently
 * holds the 'invalid' page table base, i.e. no user address space is
 * installed on this CPU.
 */
static inline bool
pmap_user_ttb_is_clear(void)
{
	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
}
4529 
/*
 *	Internal (PPL-callable) core of pmap_switch(): make 'pmap' the
 *	active user address space on the current CPU, performing whatever
 *	local TLB maintenance is required to avoid stale or aliased
 *	entries (software-ASID aliasing, shared-region changes, and
 *	mixed-page-size commpage transitions).
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* Only the kernel pmap may legitimately carry ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if __ARM_MIXED_PAGE_SIZE__
	/* Changing TCR (translation granule configuration) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Park the user TTB on the invalid table before any flush that needs break-before-make. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range-TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Wait for any of the asynchronous invalidations issued above to complete. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4664 
/*
 * Public wrapper for switching the current CPU onto 'pmap'; dispatches
 * to the PPL on XNU_MONITOR configurations, directly to the internal
 * implementation otherwise.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4677 
/*
 * Lower the permission of all mappings to a physical page.
 * Thin wrapper around pmap_page_protect_options() with no options.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4685 
4686 /*
4687  *	Routine:	pmap_page_protect_options
4688  *
4689  *	Function:
4690  *		Lower the permission for all mappings to a given
4691  *		page.
4692  */
4693 MARK_AS_PMAP_TEXT static void
4694 pmap_page_protect_options_with_flush_range(
4695 	ppnum_t ppnum,
4696 	vm_prot_t prot,
4697 	unsigned int options,
4698 	pmap_tlb_flush_range_t *flush_range)
4699 {
4700 	pmap_paddr_t    phys = ptoa(ppnum);
4701 	pv_entry_t    **pv_h;
4702 	pv_entry_t     *pve_p, *orig_pve_p;
4703 	pv_entry_t     *pveh_p;
4704 	pv_entry_t     *pvet_p;
4705 	pt_entry_t     *pte_p, *orig_pte_p;
4706 	pv_entry_t     *new_pve_p;
4707 	pt_entry_t     *new_pte_p;
4708 	vm_offset_t     pvh_flags;
4709 	unsigned int    pai;
4710 	bool            remove;
4711 	bool            set_NX;
4712 	unsigned int    pvh_cnt = 0;
4713 	unsigned int    pass1_updated = 0;
4714 	unsigned int    pass2_updated = 0;
4715 
4716 	assert(ppnum != vm_page_fictitious_addr);
4717 
4718 	/* Only work with managed pages. */
4719 	if (!pa_valid(phys)) {
4720 		return;
4721 	}
4722 
4723 	/*
4724 	 * Determine the new protection.
4725 	 */
4726 	switch (prot) {
4727 	case VM_PROT_ALL:
4728 		return;         /* nothing to do */
4729 	case VM_PROT_READ:
4730 	case VM_PROT_READ | VM_PROT_EXECUTE:
4731 		remove = false;
4732 		break;
4733 	default:
4734 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4735 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4736 		remove = true;
4737 		break;
4738 	}
4739 
4740 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4741 	if (remove) {
4742 #if !XNU_MONITOR
4743 		mp_disable_preemption();
4744 #endif
4745 		pmap_cpu_data = pmap_get_cpu_data();
4746 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4747 		/*
4748 		 * Ensure the store to inflight_disconnect will be observed before any of the
4749 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4750 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4751 		 * another CPU, in between this function's clearing a PTE and dropping the
4752 		 * corresponding pagetable refcount.  That can lead to a panic if the
4753 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4754 		 * store barrier; a store-release operation would not be sufficient.
4755 		 */
4756 		os_atomic_thread_fence(release);
4757 	}
4758 
4759 	pai = pa_index(phys);
4760 	pvh_lock(pai);
4761 	pv_h = pai_to_pvh(pai);
4762 	pvh_flags = pvh_get_flags(pv_h);
4763 
4764 #if XNU_MONITOR
4765 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4766 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4767 	}
4768 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4769 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4770 	}
4771 #endif
4772 
4773 
4774 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4775 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4776 	pveh_p = PV_ENTRY_NULL;
4777 	pvet_p = PV_ENTRY_NULL;
4778 	new_pve_p = PV_ENTRY_NULL;
4779 	new_pte_p = PT_ENTRY_NULL;
4780 
4781 
4782 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4783 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4784 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4785 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4786 		pveh_p = pve_p;
4787 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4788 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4789 	}
4790 
4791 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4792 	int pve_ptep_idx = 0;
4793 
4794 	/*
4795 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4796 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4797 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4798 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4799 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4800 	 * tlb_flush_needed to be true while issue_tlbi is false.
4801 	 */
4802 	bool issue_tlbi = false;
4803 	bool tlb_flush_needed = false;
4804 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4805 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4806 		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4807 		bool update = false;
4808 
4809 		if (pve_p != PV_ENTRY_NULL) {
4810 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4811 			if (pte_p == PT_ENTRY_NULL) {
4812 				goto protect_skip_pve_pass1;
4813 			}
4814 		}
4815 
4816 #ifdef PVH_FLAG_IOMMU
4817 		if (pvh_ptep_is_iommu(pte_p)) {
4818 #if XNU_MONITOR
4819 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4820 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4821 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4822 			}
4823 #endif
4824 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4825 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4826 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4827 			}
4828 			goto protect_skip_pve_pass1;
4829 		}
4830 #endif
4831 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4832 		const pmap_t pmap = ptdp->pmap;
4833 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4834 
4835 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4836 #if MACH_ASSERT
4837 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4838 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4839 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4840 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4841 
4842 				pv_entry_t *check_pvep = pve_p;
4843 
4844 				do {
4845 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4846 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4847 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4848 					}
4849 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4850 
4851 				/* Restore previous PTEP value. */
4852 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4853 			}
4854 #endif
4855 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4856 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4857 		}
4858 
4859 #if DEVELOPMENT || DEBUG
4860 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4861 #else
4862 		if ((prot & VM_PROT_EXECUTE))
4863 #endif
4864 		{
4865 			set_NX = false;
4866 		} else {
4867 			set_NX = true;
4868 		}
4869 
4870 #if HAS_FEAT_XS
4871 		/**
4872 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4873 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4874 		 */
4875 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4876 #endif /* HAS_FEAT_XS */
4877 
4878 		/* Remove the mapping if new protection is NONE */
4879 		if (remove) {
4880 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4881 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4882 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4883 			pt_entry_t spte = *pte_p;
4884 
4885 			if (pte_is_wired(spte)) {
4886 				pte_set_wired(pmap, pte_p, 0);
4887 				spte = *pte_p;
4888 				if (pmap != kernel_pmap) {
4889 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4890 				}
4891 			}
4892 
4893 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4894 			    (uint64_t)spte, pte_p, ppnum);
4895 
4896 			if (compress && is_internal && (pmap != kernel_pmap)) {
4897 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4898 				/* mark this PTE as having been "compressed" */
4899 				tmplate = ARM_PTE_COMPRESSED;
4900 				if (is_altacct) {
4901 					tmplate |= ARM_PTE_COMPRESSED_ALT;
4902 				}
4903 			} else {
4904 				tmplate = ARM_PTE_TYPE_FAULT;
4905 			}
4906 
4907 			assert(spte != tmplate);
4908 			write_pte_fast(pte_p, tmplate);
4909 			update = true;
4910 			++pass1_updated;
4911 
4912 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4913 
4914 			if (pmap != kernel_pmap) {
4915 				if (ppattr_test_reusable(pai) &&
4916 				    is_internal &&
4917 				    !is_altacct) {
4918 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4919 				} else if (!is_internal) {
4920 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4921 				}
4922 
4923 				if (is_altacct) {
4924 					assert(is_internal);
4925 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4926 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4927 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4928 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4929 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4930 					}
4931 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4932 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4933 				} else if (ppattr_test_reusable(pai)) {
4934 					assert(is_internal);
4935 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4936 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4937 						/* was not in footprint, but is now */
4938 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4939 					}
4940 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4941 				} else if (is_internal) {
4942 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4943 
4944 					/*
4945 					 * Update all stats related to physical footprint, which only
4946 					 * deals with internal pages.
4947 					 */
4948 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4949 						/*
4950 						 * This removal is only being done so we can send this page to
4951 						 * the compressor; therefore it mustn't affect total task footprint.
4952 						 */
4953 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4954 					} else {
4955 						/*
4956 						 * This internal page isn't going to the compressor, so adjust stats to keep
4957 						 * phys_footprint up to date.
4958 						 */
4959 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4960 					}
4961 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4962 				} else {
4963 					/* external page: no impact on ledgers */
4964 				}
4965 			}
4966 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4967 		} else {
4968 			pt_entry_t spte = *pte_p;
4969 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4970 
4971 			if (pmap == kernel_pmap) {
4972 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4973 			} else {
4974 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4975 			}
4976 
4977 			/*
4978 			 * While the naive implementation of this would serve to add execute
4979 			 * permission, this is not how the VM uses this interface, or how
4980 			 * x86_64 implements it.  So ignore requests to add execute permissions.
4981 			 */
4982 			if (set_NX) {
4983 				tmplate |= pt_attr_leaf_xn(pt_attr);
4984 			}
4985 
4986 
4987 			assert(spte != ARM_PTE_TYPE_FAULT);
4988 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4989 
4990 			if (spte != tmplate) {
4991 				/*
4992 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4993 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4994 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
4995 				 * should always be cleared by this function.
4996 				 */
4997 				pte_set_was_writeable(tmplate, true);
4998 				write_pte_fast(pte_p, tmplate);
4999 				update = true;
5000 				++pass1_updated;
5001 			} else if (pte_was_writeable(tmplate)) {
5002 				/*
5003 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5004 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
5005 				 * write access to a page, this function should always at least clear that flag for
5006 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5007 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
5008 				 * be handled through arm_fast_fault().
5009 				 */
5010 				pte_set_was_writeable(tmplate, false);
5011 				write_pte_fast(pte_p, tmplate);
5012 			}
5013 		}
5014 
5015 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5016 			tlb_flush_needed = true;
5017 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5018 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5019 				issue_tlbi = true;
5020 			}
5021 		}
5022 protect_skip_pve_pass1:
5023 		pte_p = PT_ENTRY_NULL;
5024 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5025 			pve_ptep_idx = 0;
5026 			pve_p = pve_next(pve_p);
5027 		}
5028 	}
5029 
5030 	if (tlb_flush_needed) {
5031 		FLUSH_PTE_STRONG();
5032 	}
5033 
5034 	if (!remove && !issue_tlbi) {
5035 		goto protect_finish;
5036 	}
5037 
5038 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5039 	pv_entry_t **pve_pp = pv_h;
5040 	pve_p = orig_pve_p;
5041 	pte_p = orig_pte_p;
5042 	pve_ptep_idx = 0;
5043 
5044 	/*
5045 	 * We need to keep track of whether a particular PVE list contains IOMMU
5046 	 * mappings when removing entries, because we should only remove CPU
5047 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5048 	 * it around.
5049 	 */
5050 	bool iommu_mapping_in_pve = false;
5051 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5052 		if (pve_p != PV_ENTRY_NULL) {
5053 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5054 			if (pte_p == PT_ENTRY_NULL) {
5055 				goto protect_skip_pve_pass2;
5056 			}
5057 		}
5058 
5059 #ifdef PVH_FLAG_IOMMU
5060 		if (pvh_ptep_is_iommu(pte_p)) {
5061 			iommu_mapping_in_pve = true;
5062 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5063 				/*
5064 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5065 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5066 				 * contain the single IOMMU PTE and exit the loop.
5067 				 */
5068 				new_pte_p = pte_p;
5069 				break;
5070 			}
5071 			goto protect_skip_pve_pass2;
5072 		}
5073 #endif
5074 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5075 		const pmap_t pmap = ptdp->pmap;
5076 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5077 
5078 		if (remove) {
5079 			if (!compress && (pmap != kernel_pmap)) {
5080 				/*
5081 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5082 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5083 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5084 				 * under us.
5085 				 */
5086 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5087 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5088 				}
5089 			}
5090 			/* Remove this CPU mapping from PVE list. */
5091 			if (pve_p != PV_ENTRY_NULL) {
5092 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5093 			}
5094 		} else {
5095 			pt_entry_t spte = *pte_p;
5096 			if (pte_was_writeable(spte)) {
5097 				pte_set_was_writeable(spte, false);
5098 				write_pte_fast(pte_p, spte);
5099 			} else {
5100 				goto protect_skip_pve_pass2;
5101 			}
5102 		}
5103 		++pass2_updated;
5104 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5105 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5106 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5107 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5108 		}
5109 
5110 protect_skip_pve_pass2:
5111 		pte_p = PT_ENTRY_NULL;
5112 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5113 			pve_ptep_idx = 0;
5114 
5115 			if (remove) {
5116 				/**
5117 				 * If there are any IOMMU mappings in the PVE list, preserve
5118 				 * those mappings in a new PVE list (new_pve_p) which will later
5119 				 * become the new PVH entry. Keep track of the CPU mappings in
5120 				 * pveh_p/pvet_p so they can be deallocated later.
5121 				 */
5122 				if (iommu_mapping_in_pve) {
5123 					iommu_mapping_in_pve = false;
5124 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5125 					pve_remove(pv_h, pve_pp, pve_p);
5126 					pveh_p = pvh_pve_list(pv_h);
5127 					pve_p->pve_next = new_pve_p;
5128 					new_pve_p = pve_p;
5129 					pve_p = temp_pve_p;
5130 					continue;
5131 				} else {
5132 					pvet_p = pve_p;
5133 					pvh_cnt++;
5134 				}
5135 			}
5136 
5137 			pve_pp = pve_next_ptr(pve_p);
5138 			pve_p = pve_next(pve_p);
5139 			iommu_mapping_in_pve = false;
5140 		}
5141 	}
5142 
5143 protect_finish:
5144 
5145 #ifdef PVH_FLAG_EXEC
5146 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5147 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5148 	}
5149 #endif
5150 	if (__improbable(pass1_updated != pass2_updated)) {
5151 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5152 		    __func__, pass1_updated, pass2_updated);
5153 	}
5154 	/* if we removed a bunch of entries, take care of them now */
5155 	if (remove) {
5156 		if (new_pve_p != PV_ENTRY_NULL) {
5157 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5158 			pvh_set_flags(pv_h, pvh_flags);
5159 		} else if (new_pte_p != PT_ENTRY_NULL) {
5160 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5161 			pvh_set_flags(pv_h, pvh_flags);
5162 		} else {
5163 			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5164 				pmap_flush_noncoherent_page(phys);
5165 			}
5166 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5167 		}
5168 	}
5169 
5170 	if (flush_range && tlb_flush_needed) {
5171 		if (!remove) {
5172 			flush_range->ptfr_flush_needed = true;
5173 			tlb_flush_needed = false;
5174 		}
5175 	}
5176 
5177 	/*
5178 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5179 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5180 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5181 	 * a page to be repurposed while it is still live in the TLBs.
5182 	 */
5183 	if (remove && tlb_flush_needed) {
5184 		sync_tlb_flush();
5185 	}
5186 
5187 
5188 	pvh_unlock(pai);
5189 
5190 	if (remove) {
5191 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5192 #if !XNU_MONITOR
5193 		mp_enable_preemption();
5194 #endif
5195 	}
5196 
5197 	if (!remove && tlb_flush_needed) {
5198 		sync_tlb_flush();
5199 	}
5200 
5201 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5202 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5203 	}
5204 }
5205 
/*
 * Internal (PPL-callable) entry point for pmap_page_protect_options().
 * 'arg' is only tested for NULL-ness here; a non-NULL value forces the
 * TLB flush to be performed inline (see comment below).
 */
MARK_AS_PMAP_TEXT void
pmap_page_protect_options_internal(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	if (arg != NULL) {
		/*
		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
		 * In that case, force the flush to take place.
		 */
		options &= ~PMAP_OPTIONS_NOFLUSH;
	}
	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
}
5225 
5226 void
5227 pmap_page_protect_options(
5228 	ppnum_t ppnum,
5229 	vm_prot_t prot,
5230 	unsigned int options,
5231 	void *arg)
5232 {
5233 	pmap_paddr_t    phys = ptoa(ppnum);
5234 
5235 	assert(ppnum != vm_page_fictitious_addr);
5236 
5237 	/* Only work with managed pages. */
5238 	if (!pa_valid(phys)) {
5239 		return;
5240 	}
5241 
5242 	/*
5243 	 * Determine the new protection.
5244 	 */
5245 	if (prot == VM_PROT_ALL) {
5246 		return;         /* nothing to do */
5247 	}
5248 
5249 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5250 
5251 #if XNU_MONITOR
5252 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5253 #else
5254 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5255 #endif
5256 
5257 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5258 }
5259 
5260 
5261 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Worker for pmap_disable_user_jop(): mark a user pmap as having pointer
 * authentication (JOP) disabled.  Only valid for user pmaps.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	/* JOP may only be disabled for user pmaps, never for the kernel's. */
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5271 
/*
 * Disable pointer authentication (JOP) for the given user pmap.
 * Dispatches to the PPL entry point when XNU_MONITOR is configured.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5281 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5282 
5283 /*
5284  * Indicates if the pmap layer enforces some additional restrictions on the
5285  * given set of protections.
5286  */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/* The ARM pmap imposes no additional protection policy of its own. */
	return false;
}
5292 
5293 /*
5294  *	Set the physical protection on the
5295  *	specified range of this map as requested.
5296  *	VERY IMPORTANT: Will not increase permissions.
5297  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5298  */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Convenience wrapper: protect [b, e) with no options and no VM argument. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5308 
/*
 * Worker for pmap_protect_options(): lower the protections on the mappings in
 * [start, end), which must lie within a single twig (one leaf page table) of
 * "pmap".  Permissions are only ever reduced here, never granted; only
 * pmap_enter() may grant access.
 *
 * Returns the address up to which entries were processed.  This may be short
 * of "end" if the loop broke out early to service pending preemption, in
 * which case the caller is expected to retry from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must not be inverted and must not cross a twig boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Revoking all access is a remove, not a protect; the caller filters this. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* On DEVELOPMENT/DEBUG kernels, the nx_enabled knobs can force mappings executable. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Bail out periodically if preemption is pending; the caller resumes at "va". */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Skip empty and compressed (swapped-out marker) entries. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock for the backing physical
			 * page.  The PTE may change between reading it and acquiring the
			 * lock, so re-read it under the lock and retry until the lock we
			 * hold matches the page the PTE points at.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the entry may have become empty/compressed while locking. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the new PTE: demote the AP bits to read-only (kernel or user). */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish the PTE updates, then flush the processed VA range from the TLBs. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table for this range: nothing is mapped, so report the whole range done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5556 
/*
 * Reduce the protections on the mappings in [b, e) of "pmap" to at most
 * "prot".  Requests that would revoke all access are redirected to
 * pmap_remove_options(); requests that would not reduce anything return
 * immediately.  The range is processed one twig (leaf page table) at a time.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			/* Immediately revoking all access is a remove, not a protect. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Any other combination revokes access entirely: defer to remove. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Walk the range one twig at a time.  The callee returns the address it
	 * stopped at, which may be short of "l" if it broke out early (e.g. for
	 * pending preemption); in that case we simply resume from that address.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5631 
5632 /**
5633  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5634  *
5635  * @param pmap pmap to insert the pages into.
5636  * @param va virtual address to map the pages into.
5637  * @param pa page number of the first physical page to map.
5638  * @param size block size, in number of pages.
5639  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags extra flags; currently only reported in the panic diagnostics on failure.
5641  *
5642  * @return KERN_SUCCESS.
5643  */
kern_return_t
pmap_map_block(
	pmap_t pmap,
	addr64_t va,
	ppnum_t pa,
	uint32_t size,
	vm_prot_t prot,
	int attr,
	unsigned int flags)
{
	/* Convert the page number to a physical address and defer to the addr-based variant. */
	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
}
5656 
5657 /**
5658  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5659  * As opposed to pmap_map_block(), this function takes
5660  * a physical address as an input and operates using the
5661  * page size associated with the input pmap.
5662  *
5663  * @param pmap pmap to insert the pages into.
5664  * @param va virtual address to map the pages into.
5665  * @param pa physical address of the first physical page to map.
5666  * @param size block size, in number of pages.
5667  * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags extra flags; currently only reported in the panic diagnostics on failure.
5669  *
5670  * @return KERN_SUCCESS.
5671  */
5672 kern_return_t
5673 pmap_map_block_addr(
5674 	pmap_t pmap,
5675 	addr64_t va,
5676 	pmap_paddr_t pa,
5677 	uint32_t size,
5678 	vm_prot_t prot,
5679 	int attr,
5680 	unsigned int flags)
5681 {
5682 #if __ARM_MIXED_PAGE_SIZE__
5683 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5684 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5685 #else
5686 	const uint64_t pmap_page_size = PAGE_SIZE;
5687 #endif
5688 
5689 	for (ppnum_t page = 0; page < size; page++) {
5690 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5691 			panic("%s: failed pmap_enter_addr, "
5692 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5693 			    __FUNCTION__,
5694 			    pmap, va, (uint64_t)pa, size, prot, flags);
5695 		}
5696 
5697 		va += pmap_page_size;
5698 		pa += pmap_page_size;
5699 	}
5700 
5701 	return KERN_SUCCESS;
5702 }
5703 
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	/* Thin wrapper: enter the mapping with no options and no extended arguments. */
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
}
5716 
5717 /*
5718  *	Insert the given physical page (p) at
5719  *	the specified virtual address (v) in the
5720  *	target physical map with the protection requested.
5721  *
5722  *	If specified, the page will be wired down, meaning
5723  *	that the related pte can not be reclaimed.
5724  *
5725  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5726  *	or lose information.  That is, this routine must actually
5727  *	insert this page into the given map eventually (must make
 *	forward progress eventually).
5729  */
kern_return_t
pmap_enter(
	pmap_t pmap,
	vm_map_address_t v,
	ppnum_t pn,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	/* Convert the page number to a physical address and defer to pmap_enter_addr(). */
	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
}
5742 
5743 /*
5744  * Attempt to commit the pte.
5745  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5746  * Performs no page table or accounting writes on failures.
5747  */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/* Single-PTE case: commit atomically; on failure *old_pte holds the observed value. */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	if (success && *old_pte != new_pte) {
		/* A TLB flush is only needed when the previous entry was a valid mapping. */
		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/*
		 * Replacing a compressed marker counts as a wiring change iff the new
		 * pte is wired; otherwise compare the old and new wired bits.
		 */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		/* Keep the page table's wired refcount and the task's wired-memory ledger in sync. */
		if (pmap != kernel_pmap && changed_wiring) {
			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5813 
/*
 * Translate VM_WIMG_* cacheability/ordering flags into the corresponding PTE
 * attribute-index, shareability, and execute-never bits.  "pa" is consulted
 * only to distinguish DRAM from non-DRAM addresses for the IO/posted modes.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		// Non-DRAM addresses get the XS variant of the attribute index.
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Anything unrecognized falls back to the default (cacheable) attribute.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5885 
5886 
5887 /*
5888  * Construct a PTE (and the physical page attributes) for the given virtual to
5889  * physical mapping.
5890  *
5891  * This function has no side effects and is safe to call so that it is safe to
5892  * call while attempting a pmap_enter transaction.
5893  */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	uint16_t *pp_attr_bits /* OUTPUT */
	)
{
	bool set_NX = false, set_XO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/* On DEVELOPMENT/DEBUG kernels, the nx_enabled knobs can force mappings executable. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only requests use the XO permission index below. */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable mappings must still be user-NX. */
			pte |= ARM_PTE_NX;
		} else {
			/* User-executable mappings get the user-X (kernel-PNX) encoding. */
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		/*
		 * Non-nested user mappings are per-ASID (non-global).  Within a
		 * nested (shared-region) pmap, a mapping is made non-global only
		 * when its twig index is marked in nested_region_asid_bitmap.
		 */
		if (pmap->type != PMAP_TYPE_NESTED) {
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_asid_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));

			if ((pmap->nested_region_asid_bitmap)
			    && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					/* An actual write fault: map writable and record the modification now. */
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (set_XO) {
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Access flag: the mapping is considered referenced on creation. */
	pte |= ARM_PTE_AF;
	return pte;
}
5996 
5997 MARK_AS_PMAP_TEXT kern_return_t
5998 pmap_enter_options_internal(
5999 	pmap_t pmap,
6000 	vm_map_address_t v,
6001 	pmap_paddr_t pa,
6002 	vm_prot_t prot,
6003 	vm_prot_t fault_type,
6004 	unsigned int flags,
6005 	boolean_t wired,
6006 	unsigned int options)
6007 {
6008 	ppnum_t         pn = (ppnum_t)atop(pa);
6009 	pt_entry_t      pte;
6010 	pt_entry_t      spte;
6011 	pt_entry_t      *pte_p;
6012 	bool            refcnt_updated;
6013 	bool            wiredcnt_updated;
6014 	bool            ro_va = false;
6015 	unsigned int    wimg_bits;
6016 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6017 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6018 	kern_return_t   kr = KERN_SUCCESS;
6019 	uint16_t pp_attr_bits;
6020 	volatile uint16_t *refcnt;
6021 	volatile uint16_t *wiredcnt;
6022 	pv_free_list_t *local_pv_free;
6023 
6024 	validate_pmap_mutable(pmap);
6025 
6026 #if XNU_MONITOR
6027 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6028 		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
6029 	}
6030 #endif
6031 
6032 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6033 
6034 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
6035 		panic("pmap_enter_options() pmap %p v 0x%llx",
6036 		    pmap, (uint64_t)v);
6037 	}
6038 
6039 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6040 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6041 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6042 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6043 	}
6044 
6045 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6046 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6047 		    pmap, (uint64_t)pa);
6048 	}
6049 
6050 	/* The PA should not extend beyond the architected physical address space */
6051 	pa &= ARM_PTE_PAGE_MASK;
6052 
6053 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6054 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6055 		extern vm_offset_t ctrr_test_page;
6056 		if (__probable(v != ctrr_test_page))
6057 #endif
6058 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6059 	}
6060 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6061 		if (__improbable(prot != VM_PROT_READ)) {
6062 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6063 			    __func__, (unsigned long long)v, prot);
6064 		}
6065 		ro_va = true;
6066 	}
6067 	assert(pn != vm_page_fictitious_addr);
6068 
6069 	refcnt_updated = false;
6070 	wiredcnt_updated = false;
6071 
6072 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6073 		/*
6074 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6075 		 *
6076 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6077 		 */
6078 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6079 	}
6080 
6081 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6082 		return KERN_ABORTED;
6083 	}
6084 
6085 	/*
6086 	 *	Expand pmap to include this pte.  Assume that
6087 	 *	pmap is always expanded to include enough hardware
6088 	 *	pages to map one VM page.
6089 	 */
6090 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6091 		/* Must unlock to expand the pmap. */
6092 		pmap_unlock(pmap, lock_mode);
6093 
6094 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6095 
6096 		if (kr != KERN_SUCCESS) {
6097 			return kr;
6098 		}
6099 
6100 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6101 			return KERN_ABORTED;
6102 		}
6103 	}
6104 
6105 	if (options & PMAP_OPTIONS_NOENTER) {
6106 		pmap_unlock(pmap, lock_mode);
6107 		return KERN_SUCCESS;
6108 	}
6109 
6110 	/*
6111 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6112 	 * done via a cmpxchg loop.
6113 	 * We need to be careful about modifying non-local data structures before commiting
6114 	 * the new pte since we may need to re-do the transaction.
6115 	 */
6116 	spte = os_atomic_load(pte_p, relaxed);
6117 	while (!committed) {
6118 		refcnt = NULL;
6119 		wiredcnt = NULL;
6120 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6121 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6122 
6123 		if (pmap != kernel_pmap) {
6124 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6125 			refcnt = &ptd_info->refcnt;
6126 			wiredcnt = &ptd_info->wiredcnt;
6127 			/*
6128 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6129 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6130 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6131 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6132 			 * have PTDs, so we can't use the check there.
6133 			 */
6134 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6135 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6136 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6137 			}
6138 			/*
6139 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6140 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6141 			 * or acquire the pmap lock exclusive.
6142 			 */
6143 			if (!wiredcnt_updated) {
6144 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6145 				wiredcnt_updated = true;
6146 			}
6147 			if (!refcnt_updated) {
6148 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6149 				refcnt_updated = true;
6150 				drop_refcnt = true;
6151 			}
6152 		}
6153 
6154 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6155 			/*
6156 			 * There is already a mapping here & it's for a different physical page.
6157 			 * First remove that mapping.
6158 			 *
6159 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6160 			 */
6161 			if (lock_mode == PMAP_LOCK_SHARED) {
6162 				if (pmap_lock_shared_to_exclusive(pmap)) {
6163 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6164 				} else {
6165 					/*
6166 					 * We failed to upgrade to an exclusive lock.
6167 					 * As a result we no longer hold the lock at all,
6168 					 * so we need to re-acquire it and restart the transaction.
6169 					 */
6170 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6171 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6172 					/* pmap might have changed after we dropped the lock. Try again. */
6173 					spte = os_atomic_load(pte_p, relaxed);
6174 					continue;
6175 				}
6176 			}
6177 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6178 			spte = ARM_PTE_TYPE_FAULT;
6179 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6180 		}
6181 
6182 		/*
6183 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6184 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6185 		 * read-write protection. The PMAP layer though still needs to use the right
6186 		 * index, which is the older XO-now-TPRO one and that is specially selected
6187 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6188 		 */
6189 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6190 			if (__improbable(pmap == kernel_pmap)) {
6191 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6192 				    __func__);
6193 			}
6194 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6195 		} else {
6196 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6197 		}
6198 
6199 		if (pa_valid(pa)) {
6200 			unsigned int pai;
6201 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6202 
6203 			is_internal = FALSE;
6204 			is_altacct = FALSE;
6205 
6206 			pai = pa_index(pa);
6207 
6208 			pvh_lock(pai);
6209 
6210 			/*
6211 			 * Make sure that the current per-cpu PV free list has
6212 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6213 			 * if the transaction succeeds. We're either in the
6214 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6215 			 * Note that we can still be interrupted, but a primary
6216 			 * interrupt handler can never enter the pmap.
6217 			 */
6218 #if !XNU_MONITOR
6219 			assert(get_preemption_level() > 0);
6220 #endif
6221 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6222 			pv_entry_t **pv_h = pai_to_pvh(pai);
6223 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6224 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6225 
6226 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6227 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6228 				int new_allocated_pves = 0;
6229 
6230 				while (new_allocated_pves < 2) {
6231 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6232 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6233 					if (pv_status == PV_ALLOC_FAIL) {
6234 						break;
6235 					} else if (pv_status == PV_ALLOC_RETRY) {
6236 						/*
6237 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6238 						 * it will have dropped the pmap lock while doing so.
6239 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6240 						 * be on a different CPU now.
6241 						 */
6242 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6243 					} else {
6244 						/* If we've gotten this far then a node should've been allocated. */
6245 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6246 
6247 						new_allocated_pves++;
6248 					}
6249 				}
6250 
6251 				for (int i = 0; i < new_allocated_pves; i++) {
6252 					pv_free(new_pve_p[i]);
6253 				}
6254 			}
6255 
6256 			if (pv_status == PV_ALLOC_FAIL) {
6257 				pvh_unlock(pai);
6258 				kr = KERN_RESOURCE_SHORTAGE;
6259 				break;
6260 			} else if (pv_status == PV_ALLOC_RETRY) {
6261 				pvh_unlock(pai);
6262 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6263 				spte = os_atomic_load(pte_p, relaxed);
6264 				continue;
6265 			}
6266 
6267 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6268 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6269 			} else {
6270 				wimg_bits = pmap_cache_attributes(pn);
6271 			}
6272 
6273 			/* We may be retrying this operation after dropping the PVH lock.
6274 			 * Cache attributes for the physical page may have changed while the lock
6275 			 * was dropped, so clear any cache attributes we may have previously set
6276 			 * in the PTE template. */
6277 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6278 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6279 
6280 #if XNU_MONITOR
6281 			/* The regular old kernel is not allowed to remap PPL pages. */
6282 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6283 				panic("%s: page belongs to PPL, "
6284 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6285 				    __FUNCTION__,
6286 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6287 			}
6288 
6289 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6290 				panic("%s: page locked down, "
6291 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6292 				    __FUNCTION__,
6293 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6294 			}
6295 #endif
6296 
6297 
6298 
6299 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6300 			if (!committed) {
6301 				pvh_unlock(pai);
6302 				continue;
6303 			}
6304 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6305 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6306 
6307 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6308 			/*
6309 			 * If there was already a valid pte here then we reuse its reference
6310 			 * on the ptd and drop the one that we took above.
6311 			 */
6312 			drop_refcnt = had_valid_mapping;
6313 
6314 			if (!had_valid_mapping) {
6315 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6316 				int pve_ptep_idx = 0;
6317 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6318 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6319 				if (pv_status != PV_ALLOC_SUCCESS) {
6320 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6321 					    __func__, pv_status, new_pve_p, pmap);
6322 				}
6323 
6324 				if (pmap != kernel_pmap) {
6325 					if (options & PMAP_OPTIONS_INTERNAL) {
6326 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6327 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6328 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6329 							/*
6330 							 * Make a note to ourselves that this
6331 							 * mapping is using alternative
6332 							 * accounting. We'll need this in order
6333 							 * to know which ledger to debit when
6334 							 * the mapping is removed.
6335 							 *
6336 							 * The altacct bit must be set while
6337 							 * the pv head is locked. Defer the
6338 							 * ledger accounting until after we've
6339 							 * dropped the lock.
6340 							 */
6341 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6342 							is_altacct = TRUE;
6343 						}
6344 					}
6345 					if (ppattr_test_reusable(pai) &&
6346 					    !is_altacct) {
6347 						is_reusable = TRUE;
6348 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6349 						is_internal = TRUE;
6350 					} else {
6351 						is_external = TRUE;
6352 					}
6353 				}
6354 			}
6355 
6356 			pvh_unlock(pai);
6357 
6358 			if (pp_attr_bits != 0) {
6359 				ppattr_pa_set_bits(pa, pp_attr_bits);
6360 			}
6361 
6362 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6363 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6364 
6365 				if (is_internal) {
6366 					/*
6367 					 * Make corresponding adjustments to
6368 					 * phys_footprint statistics.
6369 					 */
6370 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6371 					if (is_altacct) {
6372 						/*
6373 						 * If this page is internal and
6374 						 * in an IOKit region, credit
6375 						 * the task's total count of
6376 						 * dirty, internal IOKit pages.
6377 						 * It should *not* count towards
6378 						 * the task's total physical
6379 						 * memory footprint, because
6380 						 * this entire region was
6381 						 * already billed to the task
6382 						 * at the time the mapping was
6383 						 * created.
6384 						 *
6385 						 * Put another way, this is
6386 						 * internal++ and
6387 						 * alternate_accounting++, so
6388 						 * net effect on phys_footprint
6389 						 * is 0. That means: don't
6390 						 * touch phys_footprint here.
6391 						 */
6392 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6393 					} else {
6394 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6395 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6396 							skip_footprint_debit = true;
6397 						} else {
6398 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6399 						}
6400 					}
6401 				}
6402 				if (is_reusable) {
6403 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6404 				} else if (is_external) {
6405 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6406 				}
6407 			}
6408 		} else {
6409 			if (prot & VM_PROT_EXECUTE) {
6410 				kr = KERN_FAILURE;
6411 				break;
6412 			}
6413 
6414 			wimg_bits = pmap_cache_attributes(pn);
6415 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6416 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6417 			}
6418 
6419 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6420 
6421 #if XNU_MONITOR
6422 			pte = pmap_construct_io_pte(pa, pte);
6423 
6424 			/**
6425 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6426 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed.  We must therefore prevent an attacker from downgrading
			 * a writable mapping in order to allow it to be removed and remapped to something else.
6429 			 */
6430 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6431 			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6432 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6433 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6434 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6435 				    __func__, (uint64_t)pte_to_pa(spte));
6436 			}
6437 #endif
6438 
6439 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6440 			if (committed) {
6441 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6442 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6443 
6444 				/**
6445 				 * If there was already a valid pte here then we reuse its
6446 				 * reference on the ptd and drop the one that we took above.
6447 				 */
6448 				drop_refcnt = had_valid_mapping;
6449 			}
6450 		}
6451 		if (committed) {
6452 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6453 				assert(pmap != kernel_pmap);
6454 
6455 				/* One less "compressed" */
6456 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6457 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6458 
6459 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6460 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6461 				} else if (!skip_footprint_debit) {
6462 					/* Was part of the footprint */
6463 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6464 				}
6465 				/* The old entry held a reference so drop the extra one that we took above. */
6466 				drop_refcnt = true;
6467 			}
6468 		}
6469 	}
6470 
6471 	if (drop_refcnt && refcnt != NULL) {
6472 		assert(refcnt_updated);
6473 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6474 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6475 		}
6476 	}
6477 
6478 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6479 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6480 	}
6481 
6482 	pmap_unlock(pmap, lock_mode);
6483 
6484 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6485 		pmap_phys_write_disable(v);
6486 	}
6487 
6488 	return kr;
6489 }
6490 
/*
 * Enter (or update) a mapping for virtual address 'v' -> physical address 'pa'
 * in 'pmap', retrying the underlying enter operation until it either succeeds,
 * fails permanently, or the caller asked not to wait for resources.
 *
 * @param pmap       The pmap to enter the mapping into.
 * @param v          The virtual address to map.
 * @param pa         The physical address to map to.
 * @param prot       The protection to apply to the mapping.
 * @param fault_type The type of fault (if any) that triggered this call.
 * @param flags      WIMG cacheability flags.
 * @param wired      Whether the mapping should be wired.
 * @param options    PMAP_OPTIONS_* flags; PMAP_OPTIONS_NOWAIT prevents blocking
 *                   on resource shortage.
 * @param arg        Unused.
 *
 * @return KERN_SUCCESS, or an error from the internal enter operation
 *         (e.g. KERN_RESOURCE_SHORTAGE when NOWAIT was requested).
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	/*
	 * Retry on KERN_RESOURCE_SHORTAGE (after topping up pages on PPL systems)
	 * and on KERN_ABORTED, unless the caller requested NOWAIT.
	 */
	do {
#if XNU_MONITOR
		/* The PPL call must never block; page allocation is handled below instead. */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed the PPL a page so the retry can make progress. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller would rather get the shortage error than wait. */
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6536 
6537 kern_return_t
6538 pmap_enter_options(
6539 	pmap_t pmap,
6540 	vm_map_address_t v,
6541 	ppnum_t pn,
6542 	vm_prot_t prot,
6543 	vm_prot_t fault_type,
6544 	unsigned int flags,
6545 	boolean_t wired,
6546 	unsigned int options,
6547 	__unused void   *arg)
6548 {
6549 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6550 }
6551 
6552 /*
6553  *	Routine:	pmap_change_wiring
6554  *	Function:	Change the wiring attribute for a map/virtual-address
6555  *			pair.
6556  *	In/out conditions:
6557  *			The mapping must already exist in the pmap.
6558  */
6559 MARK_AS_PMAP_TEXT kern_return_t
6560 pmap_change_wiring_internal(
6561 	pmap_t pmap,
6562 	vm_map_address_t v,
6563 	boolean_t wired)
6564 {
6565 	pt_entry_t     *pte_p;
6566 	pmap_paddr_t    pa;
6567 
6568 	validate_pmap_mutable(pmap);
6569 
6570 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6571 		return KERN_ABORTED;
6572 	}
6573 
6574 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6575 
6576 	pte_p = pmap_pte(pmap, v);
6577 	if (pte_p == PT_ENTRY_NULL) {
6578 		if (!wired) {
6579 			/*
6580 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6581 			 * may have been freed by a remove operation.
6582 			 */
6583 			goto pmap_change_wiring_return;
6584 		} else {
6585 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6586 		}
6587 	}
6588 	/*
6589 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6590 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6591 	 */
6592 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6593 
6594 	while (pa_valid(pa)) {
6595 		pmap_paddr_t new_pa;
6596 
6597 		pvh_lock(pa_index(pa));
6598 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6599 
6600 		if (pa == new_pa) {
6601 			break;
6602 		}
6603 
6604 		pvh_unlock(pa_index(pa));
6605 		pa = new_pa;
6606 	}
6607 
6608 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6609 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6610 		if (!wired) {
6611 			/* PTE cleared by prior remove/disconnect operation */
6612 			goto pmap_change_wiring_cleanup;
6613 		} else {
6614 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6615 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6616 		}
6617 	}
6618 
6619 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6620 	if (wired != pte_is_wired(*pte_p)) {
6621 		pte_set_wired(pmap, pte_p, wired);
6622 		if (pmap != kernel_pmap) {
6623 			if (wired) {
6624 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6625 			} else if (!wired) {
6626 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6627 			}
6628 		}
6629 	}
6630 
6631 pmap_change_wiring_cleanup:
6632 	if (pa_valid(pa)) {
6633 		pvh_unlock(pa_index(pa));
6634 	}
6635 
6636 pmap_change_wiring_return:
6637 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6638 
6639 	return KERN_SUCCESS;
6640 }
6641 
/*
 * Change the wiring attribute of the existing mapping at 'v' in 'pmap'.
 * Panics on any failure; must be called from a preemptible context.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	/* Any failure other than the retried KERN_ABORTED case above is fatal. */
	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6669 
6670 MARK_AS_PMAP_TEXT pmap_paddr_t
6671 pmap_find_pa_internal(
6672 	pmap_t pmap,
6673 	addr64_t va)
6674 {
6675 	pmap_paddr_t    pa = 0;
6676 
6677 	validate_pmap(pmap);
6678 
6679 	if (pmap != kernel_pmap) {
6680 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6681 	}
6682 
6683 	pa = pmap_vtophys(pmap, va);
6684 
6685 	if (pmap != kernel_pmap) {
6686 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6687 	}
6688 
6689 	return pa;
6690 }
6691 
6692 pmap_paddr_t
6693 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6694 {
6695 	pmap_paddr_t pa = 0;
6696 
6697 	if (pmap == kernel_pmap) {
6698 		pa = mmu_kvtop(va);
6699 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6700 		/*
6701 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6702 		 * translation even if PAN would prevent kernel access through the translation.
6703 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6704 		 */
6705 		pa = mmu_uvtop(va);
6706 	}
6707 	return pa;
6708 }
6709 
6710 pmap_paddr_t
6711 pmap_find_pa(
6712 	pmap_t pmap,
6713 	addr64_t va)
6714 {
6715 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6716 
6717 	if (pa != 0) {
6718 		return pa;
6719 	}
6720 
6721 	if (not_in_kdp) {
6722 #if XNU_MONITOR
6723 		return pmap_find_pa_ppl(pmap, va);
6724 #else
6725 		return pmap_find_pa_internal(pmap, va);
6726 #endif
6727 	} else {
6728 		return pmap_vtophys(pmap, va);
6729 	}
6730 }
6731 
6732 ppnum_t
6733 pmap_find_phys_nofault(
6734 	pmap_t pmap,
6735 	addr64_t va)
6736 {
6737 	ppnum_t ppn;
6738 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6739 	return ppn;
6740 }
6741 
6742 ppnum_t
6743 pmap_find_phys(
6744 	pmap_t pmap,
6745 	addr64_t va)
6746 {
6747 	ppnum_t ppn;
6748 	ppn = atop(pmap_find_pa(pmap, va));
6749 	return ppn;
6750 }
6751 
6752 /**
6753  * Translate a kernel virtual address into a physical address.
6754  *
6755  * @param va The kernel virtual address to translate. Does not work on user
6756  *           virtual addresses.
6757  *
6758  * @return The physical address if the translation was successful, or zero if
6759  *         no valid mappings were found for the given virtual address.
6760  */
6761 pmap_paddr_t
6762 kvtophys(vm_offset_t va)
6763 {
6764 	/**
6765 	 * Attempt to do the translation first in hardware using the AT (address
6766 	 * translation) instruction. This will attempt to use the MMU to do the
6767 	 * translation for us.
6768 	 */
6769 	pmap_paddr_t pa = mmu_kvtop(va);
6770 
6771 	if (pa) {
6772 		return pa;
6773 	}
6774 
6775 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6776 	return pmap_vtophys(kernel_pmap, va);
6777 }
6778 
6779 /**
6780  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6781  * points to a non-kernel-managed physical page, then this call will panic().
6782  *
6783  * @note The output of this function is guaranteed to be a kernel-managed
6784  *       physical page, which means it's safe to pass the output directly to
6785  *       pa_index() to create a physical address index for various pmap data
6786  *       structures.
6787  *
6788  * @param va The kernel virtual address to translate. Does not work on user
6789  *           virtual addresses.
6790  *
6791  * @return The translated physical address for the given virtual address.
6792  */
6793 pmap_paddr_t
6794 kvtophys_nofail(vm_offset_t va)
6795 {
6796 	pmap_paddr_t pa = kvtophys(va);
6797 
6798 	if (!pa_valid(pa)) {
6799 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6800 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6801 	}
6802 
6803 	return pa;
6804 }
6805 
/*
 * Walk the page tables of 'pmap' in software to translate virtual address
 * 'va' to a physical address.
 *
 * @param pmap The pmap whose tables to walk.
 * @param va   The virtual address to translate.
 *
 * @return The physical address, or 0 if 'va' is outside the pmap's range or
 *         no valid translation exists.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Reject addresses outside this pmap's valid VA range. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend from the root table level toward the leaf level. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		/* Per-level constants describing how to decode a TTE at this level. */
		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at this level: no mapping exists. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the in-block offset of 'va'. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: descend into the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6850 
6851 /*
6852  *	pmap_init_pte_page - Initialize a page table page.
6853  */
/*
 * Initialize the page table page at 'pte_p', ensuring it has a page table
 * descriptor (PTD) and initializing that descriptor's per-table info.
 *
 * @param pmap      The pmap that will own the page table.
 * @param pte_p     Kernel virtual address of the page table page.
 * @param va        The base virtual address the table will map.
 * @param ttlevel   The translation table level of the new table.
 * @param alloc_ptd Whether to allocate a PTD if the page doesn't have one
 *                  (only expected from early boot; otherwise a missing PTD
 *                  panics).
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	/* PV head entry for the physical page backing this page table. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* A PTD already exists for this page; reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6893 
6894 /*
6895  *	Routine:	pmap_expand
6896  *
6897  *	Expands a pmap to be able to map the specified virtual address.
6898  *
6899  *	Allocates new memory for the default (COARSE) translation table
6900  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6901  *	also allocates space for the corresponding pv entries.
6902  *
6903  *	Nothing should be locked.
6904  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Descend one translation level at a time until tables exist down to 'level'. */
	for (; ttlevel < level; ttlevel++) {
		/* Lock acquisition can abort when preemption is pending; caller retries. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* No next-level table yet: drop the lock while allocating one. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Lock aborted: give back the table we just allocated. */
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock: we may have raced another expander. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Link the new table into the parent entry. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				/* Ownership of the table has transferred to the hierarchy. */
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If our allocation lost the race above, free it here. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6976 
6977 /*
6978  *	Routine:	pmap_gc
6979  *	Function:
6980  *              Pmap garbage collection
6981  *		Called by the pageout daemon when pages are scarce.
6982  *
6983  */
void
pmap_gc(void)
{
	/*
	 * Intentionally a no-op.
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
6995 
6996 /*
6997  *      By default, don't attempt pmap GC more frequently
6998  *      than once / 1 minutes.
6999  */
7000 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: pmap GC is not implemented, so there is no throttle to compute. */
}
7006 
7007 /*
7008  * pmap_attribute_cache_sync(vm_offset_t pa)
7009  *
7010  * Invalidates all of the instruction cache on a physical page and
7011  * pushes any dirty data from the data cache for the same physical page
7012  */
7013 
7014 kern_return_t
7015 pmap_attribute_cache_sync(
7016 	ppnum_t pp,
7017 	vm_size_t size,
7018 	__unused vm_machine_attribute_t attribute,
7019 	__unused vm_machine_attribute_val_t * value)
7020 {
7021 	if (size > PAGE_SIZE) {
7022 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7023 	} else {
7024 		cache_sync_page(pp);
7025 	}
7026 
7027 	return KERN_SUCCESS;
7028 }
7029 
7030 /*
7031  * pmap_sync_page_data_phys(ppnum_t pp)
7032  *
7033  * Invalidates all of the instruction cache on a physical page and
7034  * pushes any dirty data from the data cache for the same physical page
7035  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Delegate to cache_sync_page() for the given physical page number. */
	cache_sync_page(pp);
}
7042 
7043 /*
7044  * pmap_sync_page_attributes_phys(ppnum_t pp)
7045  *
7046  * Write back and invalidate all cachelines on a physical page.
7047  */
7048 void
7049 pmap_sync_page_attributes_phys(
7050 	ppnum_t pp)
7051 {
7052 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7053 }
7054 
7055 #if CONFIG_COREDUMP
7056 /* temporary workaround */
7057 boolean_t
7058 coredumpok(
7059 	vm_map_t map,
7060 	mach_vm_offset_t va)
7061 {
7062 	pt_entry_t     *pte_p;
7063 	pt_entry_t      spte;
7064 
7065 	pte_p = pmap_pte(map->pmap, va);
7066 	if (0 == pte_p) {
7067 		return FALSE;
7068 	}
7069 	if (vm_map_entry_has_device_pager(map, va)) {
7070 		return FALSE;
7071 	}
7072 	spte = *pte_p;
7073 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7074 }
7075 #endif
7076 
7077 void
7078 fillPage(
7079 	ppnum_t pn,
7080 	unsigned int fill)
7081 {
7082 	unsigned int   *addr;
7083 	int             count;
7084 
7085 	addr = (unsigned int *) phystokv(ptoa(pn));
7086 	count = PAGE_SIZE / sizeof(unsigned int);
7087 	while (count--) {
7088 		*addr++ = fill;
7089 	}
7090 }
7091 
7092 extern void     mapping_set_mod(ppnum_t pn);
7093 
void
mapping_set_mod(
	ppnum_t pn)
{
	/* Mark physical page 'pn' modified via the pmap layer. */
	pmap_set_modify(pn);
}
7100 
7101 extern void     mapping_set_ref(ppnum_t pn);
7102 
void
mapping_set_ref(
	ppnum_t pn)
{
	/* Mark physical page 'pn' referenced via the pmap layer. */
	pmap_set_reference(pn);
}
7109 
7110 /*
7111  * Clear specified attribute bits.
7112  *
7113  * Try to force an arm_fast_fault() for all mappings of
7114  * the page - to force attributes to be set again at fault time.
7115  * If the forcing succeeds, clear the cached bits at the head.
7116  * Otherwise, something must have been wired, so leave the cached
7117  * attributes alone.
7118  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may never be cleared from outside the PPL. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-supplied 'arg' or flush range implies flushing is required. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		/* Remove write permission from all mappings of the page. */
		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/*
	 * Decide which access modes must fault in order for the cleared bits to
	 * be re-derived on next access.
	 */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping could be demoted to fault. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7192 
/*
 * Clear the given cached attribute bits for a physical page, with no
 * caller-coalesced TLB flush range (flush_range == NULL means any required
 * TLB maintenance is performed before returning).
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7202 
7203 #if __ARM_RANGE_TLBI__
/*
 * Clear the given attribute bits for all managed pages mapped within a
 * single twig (leaf table) span of [start, end), checking for pending
 * preemption between pages.
 *
 * Returns 'end' on completion, or an earlier address if the walk stopped
 * early due to pending preemption; the caller is expected to resume from
 * the returned address.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No translation table entry: nothing is mapped here, so nothing to clear. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Never bail on the first page, so forward progress is guaranteed. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7247 
/*
 * Clear the given attribute bits for all managed pages mapped in
 * [start, end) of 'pmap', coalescing any required TLB maintenance into a
 * single ranged flush issued at the end.
 *
 * Returns the address after the last entry processed; a return value less
 * than 'end' indicates the operation was cut short (lock contention or
 * pending preemption) and should be retried from that address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates whether any PTE change will require a TLB flush on completion. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* Return 'start' (no progress) rather than blocking on a contended lock. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk the range one twig (leaf table) at a time. */
	while (va < end) {
		vm_map_address_t curr_end;

		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7300 
7301 static void
7302 phys_attribute_clear_range(
7303 	pmap_t pmap,
7304 	vm_map_address_t start,
7305 	vm_map_address_t end,
7306 	unsigned int bits,
7307 	unsigned int options)
7308 {
7309 	/*
7310 	 * We allow single-page requests to execute non-preemptibly,
7311 	 * as it doesn't make sense to sample AST_URGENT for a single-page
7312 	 * operation, and there are a couple of special use cases that
7313 	 * require a non-preemptible single-page operation.
7314 	 */
7315 	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7316 		pmap_verify_preemptible();
7317 	}
7318 
7319 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7320 
7321 	while (start < end) {
7322 #if XNU_MONITOR
7323 		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
7324 #else
7325 		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7326 #endif
7327 	}
7328 
7329 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7330 }
7331 #endif /* __ARM_RANGE_TLBI__ */
7332 
/*
 * Clear the given cached attribute bits for a physical page, dispatching
 * to the PPL on XNU_MONITOR configurations.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7354 
7355 /*
7356  *	Set specified attribute bits.
7357  *
7358  *	Set cached value in the pv head because we have
7359  *	no per-mapping hardware support for referenced and
7360  *	modify bits.
7361  */
7362 MARK_AS_PMAP_TEXT void
7363 phys_attribute_set_internal(
7364 	ppnum_t pn,
7365 	unsigned int bits)
7366 {
7367 	pmap_paddr_t    pa = ptoa(pn);
7368 	assert(pn != vm_page_fictitious_addr);
7369 
7370 #if XNU_MONITOR
7371 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7372 		panic("%s: illegal request, "
7373 		    "pn=%u, bits=%#x",
7374 		    __FUNCTION__,
7375 		    pn, bits);
7376 	}
7377 #endif
7378 
7379 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7380 
7381 	return;
7382 }
7383 
/* Set cached attribute bits for a page, dispatching to the PPL when enabled. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7395 
7396 
7397 /*
7398  *	Check specified attribute bits.
7399  *
7400  *	use the software cached bits (since no hw support).
7401  */
7402 static boolean_t
7403 phys_attribute_test(
7404 	ppnum_t pn,
7405 	unsigned int bits)
7406 {
7407 	pmap_paddr_t    pa = ptoa(pn);
7408 	assert(pn != vm_page_fictitious_addr);
7409 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7410 }
7411 
7412 
7413 /*
7414  *	Set the modify/reference bits on the specified physical page.
7415  */
7416 void
7417 pmap_set_modify(ppnum_t pn)
7418 {
7419 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
7420 }
7421 
7422 
7423 /*
7424  *	Clear the modify bits on the specified physical page.
7425  */
7426 void
7427 pmap_clear_modify(
7428 	ppnum_t pn)
7429 {
7430 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7431 }
7432 
7433 
7434 /*
7435  *	pmap_is_modified:
7436  *
7437  *	Return whether or not the specified physical page is modified
7438  *	by any physical maps.
7439  */
7440 boolean_t
7441 pmap_is_modified(
7442 	ppnum_t pn)
7443 {
7444 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7445 }
7446 
7447 
7448 /*
7449  *	Set the reference bit on the specified physical page.
7450  */
7451 static void
7452 pmap_set_reference(
7453 	ppnum_t pn)
7454 {
7455 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
7456 }
7457 
7458 /*
7459  *	Clear the reference bits on the specified physical page.
7460  */
7461 void
7462 pmap_clear_reference(
7463 	ppnum_t pn)
7464 {
7465 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7466 }
7467 
7468 
7469 /*
7470  *	pmap_is_referenced:
7471  *
7472  *	Return whether or not the specified physical page is referenced
7473  *	by any physical maps.
7474  */
7475 boolean_t
7476 pmap_is_referenced(
7477 	ppnum_t pn)
7478 {
7479 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7480 }
7481 
7482 /*
7483  * pmap_get_refmod(phys)
7484  *  returns the referenced and modified bits of the specified
7485  *  physical page.
7486  */
7487 unsigned int
7488 pmap_get_refmod(
7489 	ppnum_t pn)
7490 {
7491 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7492 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7493 }
7494 
7495 static inline unsigned int
7496 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7497 {
7498 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7499 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7500 }
7501 
7502 /*
7503  * pmap_clear_refmod(phys, mask)
7504  *  clears the referenced and modified bits as specified by the mask
7505  *  of the specified physical page.
7506  */
7507 void
7508 pmap_clear_refmod_options(
7509 	ppnum_t         pn,
7510 	unsigned int    mask,
7511 	unsigned int    options,
7512 	void            *arg)
7513 {
7514 	unsigned int    bits;
7515 
7516 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7517 	phys_attribute_clear(pn, bits, options, arg);
7518 }
7519 
7520 /*
7521  * Perform pmap_clear_refmod_options on a virtual address range.
7522  * The operation will be performed in bulk & tlb flushes will be coalesced
7523  * if possible.
7524  *
7525  * Returns true if the operation is supported on this platform.
7526  * If this function returns false, the operation is not supported and
7527  * nothing has been modified in the pmap.
7528  */
7529 bool
7530 pmap_clear_refmod_range_options(
7531 	pmap_t pmap __unused,
7532 	vm_map_address_t start __unused,
7533 	vm_map_address_t end __unused,
7534 	unsigned int mask __unused,
7535 	unsigned int options __unused)
7536 {
7537 #if __ARM_RANGE_TLBI__
7538 	unsigned int    bits;
7539 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7540 	phys_attribute_clear_range(pmap, start, end, bits, options);
7541 	return true;
7542 #else /* __ARM_RANGE_TLBI__ */
7543 #pragma unused(pmap, start, end, mask, options)
7544 	/*
7545 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7546 	 * contiguous range of addresses. This is large performance improvement on
7547 	 * platforms that support ranged tlbi instructions. But on older platforms,
7548 	 * we can only flush per-page or the entire asid. So we currently
7549 	 * only support this operation on platforms that support ranged tlbi.
7550 	 * instructions. On other platforms, we require that
7551 	 * the VM modify the bits on a per-page basis.
7552 	 */
7553 	return false;
7554 #endif /* __ARM_RANGE_TLBI__ */
7555 }
7556 
/* Per-page wrapper for pmap_clear_refmod_options() with no options or arg. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7564 
7565 unsigned int
7566 pmap_disconnect_options(
7567 	ppnum_t pn,
7568 	unsigned int options,
7569 	void *arg)
7570 {
7571 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7572 		/*
7573 		 * On ARM, the "modified" bit is managed by software, so
7574 		 * we know up-front if the physical page is "modified",
7575 		 * without having to scan all the PTEs pointing to it.
7576 		 * The caller should have made the VM page "busy" so noone
7577 		 * should be able to establish any new mapping and "modify"
7578 		 * the page behind us.
7579 		 */
7580 		if (pmap_is_modified(pn)) {
7581 			/*
7582 			 * The page has been modified and will be sent to
7583 			 * the VM compressor.
7584 			 */
7585 			options |= PMAP_OPTIONS_COMPRESSOR;
7586 		} else {
7587 			/*
7588 			 * The page hasn't been modified and will be freed
7589 			 * instead of compressed.
7590 			 */
7591 		}
7592 	}
7593 
7594 	/* disconnect the page */
7595 	pmap_page_protect_options(pn, 0, options, arg);
7596 
7597 	/* return ref/chg status */
7598 	return pmap_get_refmod(pn);
7599 }
7600 
7601 /*
7602  *	Routine:
7603  *		pmap_disconnect
7604  *
7605  *	Function:
7606  *		Disconnect all mappings for this page and return reference and change status
7607  *		in generic format.
7608  *
7609  */
7610 unsigned int
7611 pmap_disconnect(
7612 	ppnum_t pn)
7613 {
7614 	pmap_page_protect(pn, 0);       /* disconnect the page */
7615 	return pmap_get_refmod(pn);   /* return ref/chg status */
7616 }
7617 
7618 boolean_t
7619 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7620 {
7621 	if (ptoa(first) >= vm_last_phys) {
7622 		return FALSE;
7623 	}
7624 	if (ptoa(last) < vm_first_phys) {
7625 		return FALSE;
7626 	}
7627 
7628 	return TRUE;
7629 }
7630 
7631 /*
7632  * The state maintained by the noencrypt functions is used as a
7633  * debugging aid on ARM.  This incurs some overhead on the part
7634  * of the caller.  A special case check in phys_attribute_clear
7635  * (the most expensive path) currently minimizes this overhead,
7636  * but stubbing these functions out on RELEASE kernels yields
7637  * further wins.
7638  */
7639 boolean_t
7640 pmap_is_noencrypt(
7641 	ppnum_t pn)
7642 {
7643 #if DEVELOPMENT || DEBUG
7644 	boolean_t result = FALSE;
7645 
7646 	if (!pa_valid(ptoa(pn))) {
7647 		return FALSE;
7648 	}
7649 
7650 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7651 
7652 	return result;
7653 #else
7654 #pragma unused(pn)
7655 	return FALSE;
7656 #endif
7657 }
7658 
7659 void
7660 pmap_set_noencrypt(
7661 	ppnum_t pn)
7662 {
7663 #if DEVELOPMENT || DEBUG
7664 	if (!pa_valid(ptoa(pn))) {
7665 		return;
7666 	}
7667 
7668 	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7669 #else
7670 #pragma unused(pn)
7671 #endif
7672 }
7673 
7674 void
7675 pmap_clear_noencrypt(
7676 	ppnum_t pn)
7677 {
7678 #if DEVELOPMENT || DEBUG
7679 	if (!pa_valid(ptoa(pn))) {
7680 		return;
7681 	}
7682 
7683 	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7684 #else
7685 #pragma unused(pn)
7686 #endif
7687 }
7688 
7689 #if XNU_MONITOR
/* Return whether the given managed page is tracked as PPL-owned (PP_ATTR_MONITOR). */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7696 #endif
7697 
/*
 * Lock the per-page PV head lock for a managed page.  Non-managed pages
 * (and, on XNU_MONITOR configurations, all pages) instead take the single
 * global phys_backup_lock.  Pair with pmap_unlock_phys_page().
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: reached via the dangling 'else' above, or unconditionally under XNU_MONITOR. */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7714 
7715 
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV head
 * lock for a managed page, or the global phys_backup_lock otherwise
 * (and always under XNU_MONITOR).
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: reached via the dangling 'else' above, or unconditionally under XNU_MONITOR. */
	{ simple_unlock(&phys_backup_lock);}
}
7732 
/*
 * Switch the current CPU's user translation table base to 'pmap'.
 * For the kernel pmap, the user TTB is instead pointed at the invalid
 * translation table (if not already clear).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache the nested pmap's attributes and region bounds for this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR if the incoming pmap's translation configuration differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the new table base along with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7761 
/* Point the user translation table base at the canonical invalid table. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7767 
/* Externally callable wrapper for clearing the user TTB; trampolines into the PPL when enabled. */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7779 
7780 
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* The marker must not collide with SW bits that stay meaningful on valid PTEs. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7794 
7795 
/*
 * Downgrade all mappings of the given page so that accesses outside
 * 'allow_mode' will fault, allowing ref/mod state to be gathered again.
 *
 * Returns TRUE if every mapping was downgraded as requested; FALSE if the
 * page is not managed, or if a wired mapping was skipped (wired mappings
 * are only processed when PMAP_OPTIONS_FF_WIRED is passed).
 *
 * The PV head lock for the page is taken and dropped here unless the
 * caller passes PMAP_OPTIONS_FF_LOCKED, in which case it must already
 * hold that lock.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head holds either a single PTE pointer, a PVE list, or nothing. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	/* Remember the list head so pass 2 can re-walk the same mappings. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless the caller asked for FF_WIRED. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/*
			 * Only permission changes (bits other than the SW "writeable" bit)
			 * require TLB maintenance; a flush may still be the caller's job
			 * if the VA falls inside flush_range.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					/* Tag the PTE so pass 2 knows to invalidate it individually. */
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize all pass-1 PTE stores before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged during pass 1 need invalidation; clear the tag now. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* The two passes must have visited exactly the same set of tagged PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8109 
8110 MARK_AS_PMAP_TEXT boolean_t
8111 arm_force_fast_fault_internal(
8112 	ppnum_t         ppnum,
8113 	vm_prot_t       allow_mode,
8114 	int             options)
8115 {
8116 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8117 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8118 	}
8119 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8120 }
8121 
8122 /*
8123  *	Routine:	arm_force_fast_fault
8124  *
8125  *	Function:
8126  *		Force all mappings for this page to fault according
8127  *		to the access modes allowed, so we can gather ref/modify
8128  *		bits again.
8129  */
8130 
8131 boolean_t
8132 arm_force_fast_fault(
8133 	ppnum_t         ppnum,
8134 	vm_prot_t       allow_mode,
8135 	int             options,
8136 	__unused void   *arg)
8137 {
8138 	pmap_paddr_t    phys = ptoa(ppnum);
8139 
8140 	assert(ppnum != vm_page_fictitious_addr);
8141 
8142 	if (!pa_valid(phys)) {
8143 		return FALSE;   /* Not a managed page. */
8144 	}
8145 
8146 #if XNU_MONITOR
8147 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8148 #else
8149 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8150 #endif
8151 }
8152 
8153 /*
8154  *	Routine:	arm_clear_fast_fault
8155  *
8156  *	Function:
8157  *		Clear pending force fault for all mappings for this page based on
8158  *		the observed fault type, update ref/modify bits.
8159  */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;          /* TRUE if any PTE was updated (caller should redrive the access) */
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;        /* PV entries visited; bounds each pass at PMAP_MAX_PV_LIST_CHUNK_SIZE */
	unsigned int    pass1_updated = 0; /* PTEs marked with ARM_PTE_FF_MARKER in pass 1 */
	unsigned int    pass2_updated = 0; /* marked PTEs found and invalidated in pass 2; must equal pass 1 */

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/*
	 * If the caller supplied a specific PTE, operate on only that mapping;
	 * otherwise walk every mapping recorded in the page's PV head entry.
	 */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting point so pass 2 can re-walk the identical set of mappings. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot within this PVE; advance to the next slot/PVE. */
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; nothing to fix up here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				/* Restore write permission that was previously downgraded for mod-bit tracking. */
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			/* The mapping is writable again; clear the SW "was writeable" marker. */
			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault on a mapping with AF clear: set AF and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		/* xPRR-protected mappings must never have their effective permission changed here. */
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* Only HW-visible changes (anything beyond the SW writeable bit) require a TLB flush. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				/* Tag the PTE so pass 2 can find exactly the entries needing invalidation. */
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the walk to avoid holding the PVH lock for an excessively long list. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make the pass-1 PTE stores visible before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation; clear the tag as we go. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		/* Wait for the async invalidations issued in pass 2 to complete. */
		sync_tlb_flush();
	}
	return result;
}
8361 
8362 /*
8363  * Determine if the fault was induced by software tracking of
8364  * modify/reference bits.  If so, re-enable the mapping (and set
8365  * the appropriate bits).
8366  *
8367  * Returns KERN_SUCCESS if the fault was induced and was
8368  * successfully handled.
8369  *
8370  * Returns KERN_FAILURE if the fault was not induced and
8371  * the function was unable to deal with it.
8372  *
8373  * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8374  * disallows this type of access.
8375  *
8376  * Returns KERN_ABORTED if the pmap lock is taken and a
8377  * preemption is pending.
8378  *
8379  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Back off and let the caller retry if a preemption is pending. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we observe a stable PTE value while holding the PVH lock
		 * for the page it maps (lock ordering requires reading the PTE first,
		 * then taking the PVH lock, then re-checking).
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				/* Not a managed page: no PV/ref-mod state to repair. */
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; drop the PVH lock and re-read. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8519 
kern_return_t
arm_fast_fault(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;

	/* Addresses outside the pmap's range cannot be handled here. */
	if (va < pmap->min || va >= pmap->max) {
		return result;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
	    from_user);

	/*
	 * KERN_ABORTED indicates the internal handler backed off a lock due to
	 * pending preemption; simply retry until it completes with a real result.
	 */
	do {
#if XNU_MONITOR
		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
	} while (result == KERN_ABORTED);

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

	return result;
}
8550 
8551 void
8552 pmap_copy_page(
8553 	ppnum_t psrc,
8554 	ppnum_t pdst)
8555 {
8556 	bcopy_phys((addr64_t) (ptoa(psrc)),
8557 	    (addr64_t) (ptoa(pdst)),
8558 	    PAGE_SIZE);
8559 }
8560 
8561 
8562 /*
8563  *	pmap_copy_page copies the specified (machine independent) pages.
8564  */
8565 void
8566 pmap_copy_part_page(
8567 	ppnum_t psrc,
8568 	vm_offset_t src_offset,
8569 	ppnum_t pdst,
8570 	vm_offset_t dst_offset,
8571 	vm_size_t len)
8572 {
8573 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8574 	    (addr64_t) (ptoa(pdst) + dst_offset),
8575 	    len);
8576 }
8577 
8578 
8579 /*
8580  *	pmap_zero_page zeros the specified (machine independent) page.
8581  */
8582 void
8583 pmap_zero_page(
8584 	ppnum_t pn)
8585 {
8586 	assert(pn != vm_page_fictitious_addr);
8587 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8588 }
8589 
8590 /*
8591  *	pmap_zero_part_page
8592  *	zeros the specified (machine independent) part of a page.
8593  */
8594 void
8595 pmap_zero_part_page(
8596 	ppnum_t pn,
8597 	vm_offset_t offset,
8598 	vm_size_t len)
8599 {
8600 	assert(pn != vm_page_fictitious_addr);
8601 	assert(offset + len <= PAGE_SIZE);
8602 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8603 }
8604 
/*
 * Map the lowGlo page at its fixed LOWGLOBAL_ALIAS address in the kernel pmap.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	/* The alias slot must exist and must not already be mapped. */
	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/*
	 * NOTE(review): AP_RONA is OR'd in directly rather than via the
	 * ARM_PTE_AP() field macro used elsewhere in this file — confirm the
	 * access-permission field placement is intentional.
	 */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	/* Publish the PTE, then make it visible and invalidate any stale TLB entry. */
	*ptep = pte;
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8629 
8630 vm_offset_t
8631 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8632 {
8633 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8634 		panic("%s: invalid index %u", __func__, index);
8635 	}
8636 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8637 }
8638 
/*
 * Map physical page 'pn' into a free per-CPU copy window with the given
 * protection and WIMG attributes; returns the window index for use with
 * pmap_unmap_cpu_windows_copy().
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	/* For non-managed (I/O) pages, consult the I/O attribute table for sync requirements. */
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* The PPL only permits copy windows onto non-managed pages. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first unused window slot for this CPU (empty PTE). */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, non-executable mapping of the target page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	/* Remember whether the unmap of this window will require strong sync. */
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8713 
/*
 * Map a physical page into a per-CPU copy window, dispatching to the PPL
 * when the monitor is enabled.  Returns the window index.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8726 
/*
 * Tear down the mapping previously established for this CPU's copy window
 * 'index' by pmap_map_cpu_windows_copy_internal().
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE, then invalidate; strong sync if the mapping required it. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8747 
/*
 * Unmap this CPU's copy window 'index', dispatching to the PPL when the
 * monitor is enabled.
 *
 * Fix: the previous version used `return <void expression>;`, which is a
 * C11 constraint violation (6.8.6.4p1) in a void function and only compiles
 * as a vendor extension; call the target and fall off the end instead.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8758 
8759 #if XNU_MONITOR
8760 
8761 MARK_AS_PMAP_TEXT void
8762 pmap_invoke_with_page(
8763 	ppnum_t page_number,
8764 	void *ctx,
8765 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8766 {
8767 	#pragma unused(page_number, ctx, callback)
8768 }
8769 
8770 /*
8771  * Loop over every pmap_io_range (I/O ranges marked as owned by
8772  * the PPL in the device tree) and conditionally call callback() on each range
8773  * that needs to be included in the hibernation image.
8774  *
8775  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8776  *                 context is needed in the callback.
8777  * @param callback Callback function invoked on each range (gated by flag).
8778  */
8779 MARK_AS_PMAP_TEXT void
8780 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8781 {
8782 	extern const pmap_io_range_t* io_attr_table;
8783 	extern const unsigned int num_io_rgns;
8784 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8785 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8786 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8787 		}
8788 	}
8789 }
8790 
8791 /**
8792  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8793  * PPL-owned page. Otherwise, do nothing.
8794  *
8795  * @param addr Physical address of the page to set the HASHED flag on.
8796  */
8797 MARK_AS_PMAP_TEXT void
8798 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8799 {
8800 	/* Ignore non-managed kernel memory. */
8801 	if (!pa_valid(addr)) {
8802 		return;
8803 	}
8804 
8805 	const unsigned int pai = pa_index(addr);
8806 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8807 		pv_entry_t **pv_h = pai_to_pvh(pai);
8808 
8809 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8810 		pvh_lock(pai);
8811 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8812 		pvh_unlock(pai);
8813 	}
8814 }
8815 
8816 /**
8817  * Loop through every physical page in the system and clear out the HASHED flag
8818  * on every PPL-owned page. That flag is used to keep track of which pages have
8819  * been hashed into the hibernation image during the hibernation entry process.
8820  *
8821  * The HASHED flag needs to be cleared out between hibernation cycles because the
8822  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8823  * image with the HASHED flag set on certain pages. It's important to clear the
8824  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8825  * into the hibernation image can't be compromised across hibernation cycles.
8826  */
8827 MARK_AS_PMAP_TEXT void
8828 pmap_clear_ppl_hashed_flag_all(void)
8829 {
8830 	const unsigned int last_index = pa_index(vm_last_phys);
8831 	pv_entry_t **pv_h = NULL;
8832 
8833 	for (int pai = 0; pai < last_index; ++pai) {
8834 		pv_h = pai_to_pvh(pai);
8835 
8836 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8837 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8838 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8839 			pvh_lock(pai);
8840 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8841 			pvh_unlock(pai);
8842 		}
8843 	}
8844 }
8845 
8846 /**
8847  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8848  * ppl_hib driver will call this after all wired pages have been copied into the
8849  * hibernation image.
8850  */
8851 MARK_AS_PMAP_TEXT void
8852 pmap_check_ppl_hashed_flag_all(void)
8853 {
8854 	const unsigned int last_index = pa_index(vm_last_phys);
8855 	pv_entry_t **pv_h = NULL;
8856 
8857 	for (int pai = 0; pai < last_index; ++pai) {
8858 		pv_h = pai_to_pvh(pai);
8859 
8860 		/**
8861 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8862 		 * the pages that contain the PMAP stacks.
8863 		 */
8864 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8865 		    (pai < pa_index(pmap_stacks_end_pa));
8866 
8867 		if (!is_pmap_stack &&
8868 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8869 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8870 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8871 		}
8872 	}
8873 }
8874 
8875 #endif /* XNU_MONITOR */
8876 
8877 /*
8878  * Indicate that a pmap is intended to be used as a nested pmap
8879  * within one or more larger address spaces.  This must be set
8880  * before pmap_nest() is called with this pmap as the 'subordinate'.
8881  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Only a plain user pmap may be converted into a nested pmap. */
	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}
	pmap->type = PMAP_TYPE_NESTED;
	/*
	 * Release this pmap's per-pmap ID via its pt_ops — presumably its ASID,
	 * since nested pmaps are only reached through their grand pmaps;
	 * NOTE(review): confirm against free_id() implementation.
	 */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8894 
/* Mark a pmap as nestable, dispatching to the PPL when the monitor is enabled. */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8905 
8906 bool
8907 pmap_is_nested(
8908 	pmap_t pmap)
8909 {
8910 	return pmap->type == PMAP_TYPE_NESTED;
8911 }
8912 
8913 /*
8914  * pmap_trim_range(pmap, start, end)
8915  *
8916  * pmap  = pmap to operate on
8917  * start = start of the range
8918  * end   = end of the range
8919  *
8920  * Attempts to deallocate TTEs for the given range in the nested range.
8921  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The range to trim must lie entirely within the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* No table at this twig entry; just release the lock and move on. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table; it may only be freed if every entry is invalid. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9037 
9038 /**
9039  * State machine for multi-step pmap trimming. Trimming is the action of
9040  * deallocating the TTEs of the shared region of pmaps down to a given range.
9041  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9042  * disabling preemption for too long. These steps include computing the bounds
9043  * of the shared region, trimming the head of the "grand", trimming the tail of
9044  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9045  * different conditions.
9046  *
9047  * @param grand the pmap in which the pages are nested
9048  * @param subord the pmap from which the pages are shared, or nested
9049  * @param vstart start of the used range in "grand"
9050  * @param size size of the used range
9051  * @param state the current state of the state machine
9052  *
9053  * @return the next state of the state machine, to be used in the next call
9054  *         into this function.
9055  */
9056 MARK_AS_PMAP_TEXT pmap_trim_state_t
9057 pmap_trim_internal(
9058 	pmap_t grand,
9059 	pmap_t subord,
9060 	addr64_t vstart,
9061 	uint64_t size,
9062 	pmap_trim_state_t state)
9063 {
9064 	/* Validation needs to be done regardless of state. */
9065 	addr64_t vend;
9066 
9067 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9068 		panic("%s: grand addr wraps around, "
9069 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9070 		    __func__, grand, subord, (void*)vstart, size, state);
9071 	}
9072 
9073 	validate_pmap_mutable(grand);
9074 	validate_pmap(subord);
9075 
9076 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9077 		panic("%s: subord is of non-nestable type 0x%hhx, "
9078 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9079 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9080 	}
9081 
9082 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9083 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9084 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9085 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9086 	}
9087 
9088 	if (__improbable(grand->nested_pmap != subord)) {
9089 		panic("%s: grand->nested != subord, "
9090 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9091 		    __func__, grand, subord, (void*)vstart, size, state);
9092 	}
9093 
9094 	if (__improbable((size != 0) &&
9095 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9096 		panic("%s: grand range not in nested region, "
9097 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9098 		    __func__, grand, subord, (void*)vstart, size, state);
9099 	}
9100 
9101 	/* Trimming starts with figuring out the bounds for the grand. */
9102 	if (state == PMAP_TRIM_STATE_START) {
9103 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9104 
9105 		/**
9106 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9107 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9108 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9109 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9110 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9111 		 * PMAP_TRIM_STATE_DONE.
9112 		 */
9113 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9114 			assert(subord->nested_bounds_set);
9115 
9116 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9117 			if (!grand->nested_bounds_set) {
9118 				/* Inherit the bounds from subord. */
9119 				grand->nested_region_true_start = subord->nested_region_true_start;
9120 				grand->nested_region_true_end = subord->nested_region_true_end;
9121 				grand->nested_bounds_set = true;
9122 			}
9123 
9124 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9125 
9126 			/* Now that the grand has bounds, we are done. */
9127 			return PMAP_TRIM_STATE_DONE;
9128 		}
9129 
9130 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9131 		if ((!subord->nested_bounds_set) && size) {
9132 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9133 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9134 
9135 			subord->nested_region_true_start = vstart;
9136 			subord->nested_region_true_end = vend;
9137 			subord->nested_region_true_start &= ~adjust_offmask;
9138 
9139 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9140 				panic("%s: padded true end wraps around, "
9141 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9142 				    __func__, grand, subord, (void*)vstart, size, state);
9143 			}
9144 
9145 			subord->nested_region_true_end &= ~adjust_offmask;
9146 			subord->nested_bounds_set = true;
9147 		}
9148 
9149 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9150 		if (subord->nested_bounds_set) {
9151 			/* Inherit the bounds from subord. */
9152 			grand->nested_region_true_start = subord->nested_region_true_start;
9153 			grand->nested_region_true_end = subord->nested_region_true_end;
9154 			grand->nested_bounds_set = true;
9155 
9156 			/* If we know the bounds, we can trim the pmap. */
9157 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9158 
9159 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9160 		} else {
9161 			/* Don't trim if we don't know the bounds. */
9162 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9163 
9164 			return PMAP_TRIM_STATE_DONE;
9165 		}
9166 	}
9167 
9168 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9169 	if (!grand->nested_bounds_set) {
9170 		panic("%s: !grand->nested_bounds_set, "
9171 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9172 		    __func__, grand, subord, (void*)vstart, size, state);
9173 	}
9174 
9175 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9176 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9177 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9178 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9179 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9180 			    (unsigned int)grand->nested_no_bounds_ref_state);
9181 		}
9182 
9183 #if XNU_MONITOR
9184 		if (pmap_pending_preemption()) {
9185 			return PMAP_TRIM_STATE_GRAND_AFTER;
9186 		}
9187 #endif
9188 
9189 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9190 	}
9191 
9192 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9193 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9194 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9195 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9196 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9197 			    (unsigned int)grand->nested_no_bounds_ref_state);
9198 		}
9199 
9200 #if XNU_MONITOR
9201 		if (pmap_pending_preemption()) {
9202 			return PMAP_TRIM_STATE_SUBORD;
9203 		}
9204 #endif
9205 
9206 		state = PMAP_TRIM_STATE_SUBORD;
9207 	}
9208 
9209 	/* START state is guaranteed to compute the bounds for the subord. */
9210 	if (!subord->nested_bounds_set) {
9211 		panic("%s: !subord->nested_bounds_set, "
9212 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9213 		    __func__, grand, subord, (void*)vstart, size, state);
9214 	}
9215 
9216 	if (state == PMAP_TRIM_STATE_SUBORD) {
9217 		/**
9218 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9219 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9220 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9221 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9222 		 * the state update is visible only once the preceding trim operation is complete.  An
9223 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9224 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9225 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9226 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9227 		 * of the state CAS.
9228 		 */
9229 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9230 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9231 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9232 			    (unsigned int)grand->nested_no_bounds_ref_state);
9233 		}
9234 		pmap_trim_subord(subord);
9235 	}
9236 
9237 	return PMAP_TRIM_STATE_DONE;
9238 }
9239 
9240 MARK_AS_PMAP_TEXT static void
9241 pmap_trim_self(pmap_t pmap)
9242 {
9243 	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9244 		/* If we have a no bounds ref, we need to drop it. */
9245 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9246 		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9247 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9248 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9249 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9250 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9251 
9252 		if (nested_bounds_set) {
9253 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9254 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9255 		}
9256 		/*
9257 		 * Try trimming the nested pmap, in case we had the
9258 		 * last reference.
9259 		 */
9260 		pmap_trim_subord(pmap->nested_pmap);
9261 	}
9262 }
9263 
9264 /*
9265  * pmap_trim_subord(grand, subord)
9266  *
9267  * grand  = pmap that we have nested subord in
9268  * subord = nested pmap we are attempting to trim
9269  *
9270  * Trims subord if possible
9271  */
9272 MARK_AS_PMAP_TEXT static void
9273 pmap_trim_subord(pmap_t subord)
9274 {
9275 	bool contract_subord = false;
9276 
9277 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9278 
9279 	subord->nested_no_bounds_refcnt--;
9280 
9281 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9282 		/* If this was the last no bounds reference, trim subord. */
9283 		contract_subord = true;
9284 	}
9285 
9286 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9287 
9288 	if (contract_subord) {
9289 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9290 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9291 	}
9292 }
9293 
9294 /**
9295  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9296  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9297  * disabling preemption for too long.
9298  *
9299  * @note When we load the shared region we always create pages tables for the
9300  *       entire region. In practice, the shared cache may use just a portion
9301  *       of that. Before we know the bounds of the shared region, it can
9302  *       already be mapped into processes. Therefore, once the bounds are
9303  *       known, "trimming" comes in handy to remove the unnecessary page
9304  *       tables in the processes the shared region is mapped in, and eventually
9305  *       those in the shared region itself. Note that the shared region must
9306  *       be trimmed after the user processes because it has the L3 entries
9307  *       everyone else is pointing to.
9308  *
9309  * @param grand the pmap in which the pages are nested
9310  * @param subord the pmap from which the pages are shared, or nested
9311  * @param vstart start of the used range in "grand"
9312  * @param size size of the used range
9313  */
9314 void
9315 pmap_trim(
9316 	pmap_t grand,
9317 	pmap_t subord,
9318 	addr64_t vstart,
9319 	uint64_t size)
9320 {
9321 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9322 
9323 #if XNU_MONITOR
9324 	/* On PPL systems, drives the state machine until its done. */
9325 	while (state != PMAP_TRIM_STATE_DONE) {
9326 		__assert_only pmap_trim_state_t old_state = state;
9327 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9328 
9329 		/* Are we making progress? */
9330 		assert(old_state != state);
9331 	}
9332 
9333 	pmap_ledger_check_balance(grand);
9334 	pmap_ledger_check_balance(subord);
9335 #else
9336 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9337 
9338 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9339 	assert(state == PMAP_TRIM_STATE_DONE);
9340 #endif
9341 }
9342 
9343 #if HAS_APPLE_PAC
/*
 * Sign a user-space pointer with a process-independent PAC key while the
 * user JOP key is temporarily installed.  Only the address-independent
 * instruction (asia) and data (asda) keys are permitted; any other key
 * panics.  Interrupts are disabled across the key swap so nothing can
 * observe or run under the borrowed user JOP key.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	/* Install the caller's user JOP key; saved state is restored below. */
	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/*
	 * Force 'value' to be materialized here so the signing instruction
	 * cannot be hoisted outside the window where the user key is live.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	/* ptrauth_sign_unauthenticated() requires a compile-time-constant key,
	 * hence the explicit per-key cases rather than passing 'key' through. */
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	/* Likewise pin 'res' so the sign cannot be sunk past the key restore. */
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9375 
9376 void *
9377 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9378 {
9379 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9380 }
9381 
/*
 * Authenticate (strip-and-check) a user-space pointer signed with a
 * process-independent PAC key, with the user JOP key temporarily
 * installed.  Mirrors pmap_sign_user_ptr_internal(): only asia/asda keys
 * are allowed, and interrupts are disabled across the key swap.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	/* Install the caller's user JOP key; restored below. */
	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/*
	 * The compiler barriers pin the auth operation inside the window
	 * where the user JOP key is live (no hoisting/sinking around the
	 * key enable/disable calls).
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9402 
9403 void *
9404 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9405 {
9406 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9407 }
9408 #endif /* HAS_APPLE_PAC */
9409 
9410 /*
9411  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9412  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9413  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9414  * return value, to indicate where a preempted [un]nest operation should resume.
9415  * When the return value contains the ending address of the nested region with
9416  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9417  */
9418 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9419 
9420 /*
9421  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9422  *
9423  *	grand  = the pmap that we will nest subord into
9424  *	subord = the pmap that goes into the grand
9425  *	vstart  = start of range in pmap to be inserted
9426  *	size   = Size of nest area (up to 16TB)
9427  *
9428  *	Inserts a pmap into another.  This is used to implement shared segments.
9429  *
9430  */
9431 
9432 /**
9433  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9434  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9435  * This function operates in 3 main phases:
9436  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9437  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9438  *    the mapping range are present in subord.
9439  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9440  *    contains pointers to subord's leaf-level pagetable pages for the specified
9441  *    VA range.
9442  *
9443  * This function may return early due to pending AST_URGENT preemption; if so
9444  * it will indicate the need to be re-entered.
9445  *
9446  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9447  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9448  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9449  * @param size twig-aligned size of the nesting range
9450  * @param vrestart the twig-aligned starting address of the current call.  May contain
9451  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9452  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9453  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9454  *
9455  * @return the virtual address at which to restart the operation, possibly including
9456  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9457  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9458  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	unsigned int    nested_region_asid_bitmap_size;
	unsigned int*   nested_region_asid_bitmap = NULL;
	unsigned int    new_nested_region_asid_bitmap_size;
	unsigned int*   new_nested_region_asid_bitmap = NULL;
	int             expand_options = 0;
	bool            deref_subord = true;
	bool            grand_locked = false;

	/* Compute the exclusive end of the nesting range; reject VA wraparound. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (sans the GRAND phase bit) must lie within [vstart, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord for the duration of (and possibly beyond) this call. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	/* Both pmaps must share the same page-table geometry to share TTEs. */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	/* In the PPL we cannot block for memory; callers retry on KERN_RESOURCE_SHORTAGE. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Start, size, and restart cursor must all be twig(leaf-table)-aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1a: if subord has never been nested, allocate its per-twig ASID
	 * bitmap and publish the nested-region fields.  The bitmap pointer
	 * doubles as the "region fields are initialized" flag.
	 */
	if (subord->nested_region_asid_bitmap == NULL) {
		nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_asid_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_asid_bitmap = kalloc_data(
			nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		/* Preemptible lock acquisition: bail out (restartable) if interrupted. */
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may have published the bitmap first. */
		if (subord->nested_region_asid_bitmap == NULL) {
			subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_asid_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
			nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the publish race; free our now-unneeded allocation. */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_asid_bitmap = NULL;
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * Phase 1b: if this request extends past subord's current nested region,
	 * grow the ASID bitmap (allocate-copy-swap under the lock) and enlarge
	 * the recorded region size.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t        new_size;

		nested_region_asid_bitmap = NULL;
		nested_region_asid_bitmap_size = 0;
		new_size =  vend - subord->nested_region_addr;

		/* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
		new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_asid_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_asid_bitmap = kalloc_data(
			new_nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto nest_cleanup;
		}

		/* Re-check under the lock: another thread may have already grown the region. */
		if (subord->nested_region_size < new_size) {
			bcopy(subord->nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
			nested_region_asid_bitmap_size  = subord->nested_region_asid_bitmap_size;
			nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free the old (replaced) bitmap, if we swapped it out above. */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
			nested_region_asid_bitmap = NULL;
		}
		/* Free the new bitmap if we lost the grow race and never installed it. */
		if (new_nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
			new_nested_region_asid_bitmap = NULL;
		}
	}

	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* grand was already nested: it must reference this same subord, within bounds. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			/* Extend grand's recorded nested-region size to cover this request. */
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working range to subord's true bounds (if already known). */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* The GRAND phase bit means phase 2 (subord expansion) already completed. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/*
	 * Phase 2: expand subord so that leaf-level page tables exist for every
	 * twig in the range.  Checks for pending preemption periodically and
	 * returns a restart cursor if interrupted.
	 */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Phase 2 done: mark the cursor with the GRAND bit before starting phase 3. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/*
	 * Phase 3: copy twig-level TTEs from subord into grand so grand points
	 * at subord's leaf page tables for this range.
	 */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			/* grand is missing the twig table itself; expand it to the twig level. */
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Don't mask a real expansion failure with KERN_ABORTED. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		*gtte_p = *stte_p;

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reaching true_end finishes the whole request: report vend | GRAND as done. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible before any dependent translation. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	/* krp lives in kernel (non-PPL) memory; pin it around the write. */
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	/* Free any bitmap allocations that were never installed (error paths). */
	if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_asid_bitmap,
		    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}
	if (new_nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
#else
		kfree_data(new_nested_region_asid_bitmap,
		    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}
	/* Drop the transient reference unless grand permanently adopted subord above. */
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9840 
/*
 * Nest 'subord' into 'grand' over [vstart, vstart + size), driving the
 * restartable pmap_nest_internal()/pmap_nest_ppl() until the returned
 * cursor equals vend | PMAP_NEST_GRAND (operation complete).
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	/* 'vaddr' doubles as the restart cursor (may carry PMAP_NEST_GRAND in bit 0). */
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/* Re-enter the PPL until the cursor reports completion. */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* PPL ran out of pages: donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call that didn't move the cursor would loop forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9899 
9900 /*
9901  *	kern_return_t pmap_unnest(grand, vaddr)
9902  *
9903  *	grand  = the pmap that will have the virtual range unnested
9904  *	vaddr  = start of range in pmap to be unnested
9905  *	size   = size of range in pmap to be unnested
9906  *
9907  */
9908 
9909 kern_return_t
9910 pmap_unnest(
9911 	pmap_t grand,
9912 	addr64_t vaddr,
9913 	uint64_t size)
9914 {
9915 	return pmap_unnest_options(grand, vaddr, size, 0);
9916 }
9917 
9918 /**
9919  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9920  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9921  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9922  * still have the region nested.  The mappings in 'grand' will be left empty
9923  * with the assumption that they will be demand-filled by subsequent access faults.
9924  *
9925  * This function operates in 2 main phases:
9926  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9927  *    them non-global.
9928  * 2. Clearing of the twig-level TTEs for the address range in grand.
9929  *
9930  * This function may return early due to pending AST_URGENT preemption; if so
9931  * it will indicate the need to be re-entered.
9932  *
9933  * @param grand pmap from which to unnest mappings
9934  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9935  * @param size twig-aligned size of the nested range
9936  * @param vrestart the page-aligned starting address of the current call.  May contain
9937  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9938  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9939  *        grand is being torn down and step 1) above is not needed.
9940  *
9941  * @return the virtual address at which to restart the operation, possibly including
9942  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9943  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9944  */
9945 MARK_AS_PMAP_TEXT vm_map_offset_t
9946 pmap_unnest_options_internal(
9947 	pmap_t grand,
9948 	addr64_t vaddr,
9949 	uint64_t size,
9950 	vm_map_offset_t vrestart,
9951 	unsigned int option)
9952 {
9953 	vm_map_offset_t start;
9954 	vm_map_offset_t addr;
9955 	tt_entry_t     *tte_p;
9956 	unsigned int    current_index;
9957 	unsigned int    start_index;
9958 	unsigned int    max_index;
9959 	unsigned int    entry_count = 0;
9960 
9961 	addr64_t vend;
9962 	addr64_t true_end;
9963 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9964 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9965 	}
9966 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9967 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9968 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9969 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9970 	}
9971 
9972 	validate_pmap_mutable(grand);
9973 
9974 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9975 
9976 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9977 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9978 		    (unsigned long long)vaddr, (unsigned long long)size);
9979 	}
9980 
9981 	if (__improbable(grand->nested_pmap == NULL)) {
9982 		panic("%s: %p has no nested pmap", __func__, grand);
9983 	}
9984 
9985 	true_end = vend;
9986 	if (true_end > grand->nested_pmap->nested_region_true_end) {
9987 		true_end = grand->nested_pmap->nested_region_true_end;
9988 	}
9989 
9990 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
9991 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9992 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9993 		}
9994 
9995 		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
9996 			return vrestart;
9997 		}
9998 
9999 		start = vrestart;
10000 		if (start < grand->nested_pmap->nested_region_true_start) {
10001 			start = grand->nested_pmap->nested_region_true_start;
10002 		}
10003 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10004 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10005 		bool flush_tlb = false;
10006 
10007 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10008 			pt_entry_t  *bpte, *cpte;
10009 
10010 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10011 
10012 			bpte = pmap_pte(grand->nested_pmap, addr);
10013 
10014 			/*
10015 			 * If we've re-entered this function partway through unnesting a leaf region, the
10016 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10017 			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
10018 			 * address.
10019 			 */
10020 			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
10021 			    (addr & pt_attr_twig_offmask(pt_attr))) {
10022 				/*
10023 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
10024 				 * the nested pmap in this region will now be marked non-global.  Do this
10025 				 * before marking any of the PTEs within the region as non-global to avoid
10026 				 * the possibility of pmap_enter() subsequently inserting a global mapping
10027 				 * in the region, which could lead to a TLB conflict if a non-global entry
10028 				 * is later inserted for the same VA in a pmap which has fully unnested this
10029 				 * region.
10030 				 */
10031 				setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
10032 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10033 					pmap_paddr_t    pa;
10034 					unsigned int    pai = 0;
10035 					boolean_t               managed = FALSE;
10036 					pt_entry_t  spte;
10037 
10038 					if ((*cpte != ARM_PTE_TYPE_FAULT)
10039 					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10040 						spte = *((volatile pt_entry_t*)cpte);
10041 						while (!managed) {
10042 							pa = pte_to_pa(spte);
10043 							if (!pa_valid(pa)) {
10044 								break;
10045 							}
10046 							pai = pa_index(pa);
10047 							pvh_lock(pai);
10048 							spte = *((volatile pt_entry_t*)cpte);
10049 							pa = pte_to_pa(spte);
10050 							if (pai == pa_index(pa)) {
10051 								managed = TRUE;
10052 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
10053 							}
10054 							pvh_unlock(pai);
10055 						}
10056 
10057 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10058 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
10059 							flush_tlb = true;
10060 						}
10061 
10062 						if (managed) {
10063 							pvh_assert_locked(pai);
10064 							pvh_unlock(pai);
10065 						}
10066 					}
10067 
10068 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10069 					vrestart = addr;
10070 					++entry_count;
10071 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10072 					    pmap_pending_preemption())) {
10073 						goto unnest_subord_done;
10074 					}
10075 				}
10076 			}
10077 			addr = vlim;
10078 			vrestart = addr;
10079 			++entry_count;
10080 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10081 			    pmap_pending_preemption())) {
10082 				break;
10083 			}
10084 		}
10085 
10086 unnest_subord_done:
10087 		if (flush_tlb) {
10088 			FLUSH_PTE_STRONG();
10089 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10090 		}
10091 
10092 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10093 		if (current_index < max_index) {
10094 			return vrestart;
10095 		}
10096 	}
10097 
10098 	/*
10099 	 * invalidate all pdes for segment at vaddr in pmap grand
10100 	 */
10101 	if (vrestart & PMAP_NEST_GRAND) {
10102 		addr = vrestart & ~PMAP_NEST_GRAND;
10103 		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10104 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10105 		}
10106 	} else {
10107 		addr = vaddr;
10108 		vrestart = vaddr | PMAP_NEST_GRAND;
10109 	}
10110 
10111 	/**
10112 	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10113 	 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10114 	 * upon reentry.
10115 	 */
10116 	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10117 		return vrestart;
10118 	}
10119 
10120 	if (addr < grand->nested_pmap->nested_region_true_start) {
10121 		addr = grand->nested_pmap->nested_region_true_start;
10122 	}
10123 
10124 	while (addr < true_end) {
10125 		tte_p = pmap_tte(grand, addr);
10126 		/*
10127 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10128 		 * so it's possible that a region we're trying to unnest may not have been
10129 		 * nested in the first place.
10130 		 */
10131 		if (tte_p != NULL) {
10132 			*tte_p = ARM_TTE_TYPE_FAULT;
10133 		}
10134 		addr += pt_attr_twig_size(pt_attr);
10135 		vrestart = addr | PMAP_NEST_GRAND;
10136 		++entry_count;
10137 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10138 		    pmap_pending_preemption())) {
10139 			break;
10140 		}
10141 	}
10142 	if (addr >= true_end) {
10143 		vrestart = vend | PMAP_NEST_GRAND;
10144 	}
10145 
10146 	FLUSH_PTE_STRONG();
10147 	PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10148 
10149 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10150 
10151 	return vrestart;
10152 }
10153 
10154 kern_return_t
10155 pmap_unnest_options(
10156 	pmap_t grand,
10157 	addr64_t vaddr,
10158 	uint64_t size,
10159 	unsigned int option)
10160 {
10161 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10162 	vm_map_offset_t vend = vaddr + size;
10163 
10164 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10165 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10166 
10167 	pmap_verify_preemptible();
10168 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10169 #if XNU_MONITOR
10170 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10171 #else
10172 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10173 #endif
10174 	}
10175 
10176 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10177 
10178 	return KERN_SUCCESS;
10179 }
10180 
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	/*
	 * No start/end adjustment is needed on this architecture; return TRUE so
	 * the VM layer proceeds far enough to call log_unnest_badness().
	 */
	return TRUE; /* to get to log_unnest_badness()... */
}
10189 
10190 #if PMAP_FORK_NEST
10191 /**
10192  * Perform any necessary pre-nesting of the parent's shared region at fork()
10193  * time.
10194  *
10195  * @note This should only be called from vm_map_fork().
10196  *
10197  * @param old_pmap The pmap of the parent task.
10198  * @param new_pmap The pmap of the child task.
10199  * @param nesting_start An output parameter that is updated with the start
10200  *                      address of the range that was pre-nested
10201  * @param nesting_end An output parameter that is updated with the end
10202  *                      address of the range that was pre-nested
10203  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10205  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10206  */
10207 kern_return_t
10208 pmap_fork_nest(
10209 	pmap_t old_pmap,
10210 	pmap_t new_pmap,
10211 	vm_map_offset_t *nesting_start,
10212 	vm_map_offset_t *nesting_end)
10213 {
10214 	if (old_pmap == NULL || new_pmap == NULL) {
10215 		return KERN_INVALID_ARGUMENT;
10216 	}
10217 	if (old_pmap->nested_pmap == NULL) {
10218 		return KERN_SUCCESS;
10219 	}
10220 	pmap_nest(new_pmap,
10221 	    old_pmap->nested_pmap,
10222 	    old_pmap->nested_region_addr,
10223 	    old_pmap->nested_region_size);
10224 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
10225 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
10226 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
10227 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
10228 	    new_pmap->nested_pmap,
10229 	    new_pmap->nested_region_addr,
10230 	    new_pmap->nested_region_size,
10231 	    old_pmap->nested_pmap,
10232 	    old_pmap->nested_region_addr,
10233 	    old_pmap->nested_region_size);
10234 	*nesting_start = old_pmap->nested_region_addr;
10235 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
10236 	return KERN_SUCCESS;
10237 }
10238 #endif /* PMAP_FORK_NEST */
10239 
10240 /*
10241  * disable no-execute capability on
10242  * the specified pmap
10243  */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Clearing nx_enabled allows future mappings in this pmap to be executable. */
	pmap->nx_enabled = FALSE;
}
#else
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
	/* NX cannot be disabled on RELEASE builds; deliberately a no-op. */
}
#endif
10258 
10259 /*
10260  * flush a range of hardware TLB entries.
10261  * NOTE: assumes the smallest TLB entry in use will be for
10262  * an ARM small page (4K).
10263  */
10264 
10265 #if __ARM_RANGE_TLBI__
10266 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10267 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10268 #else
10269 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10270 #endif // __ARM_RANGE_TLBI__
10271 
/*
 * Asynchronously flush TLB entries for [va, va + length) in 'pmap', choosing
 * between a full/ASID flush, a range TLBI, or per-entry invalidation based on
 * the number of pages covered.  The caller is responsible for issuing the
 * synchronizing barrier (e.g. sync_tlb_flush()).
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
	uint32_t    asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * The range is large enough that discarding the whole ASID is cheaper
		 * than per-entry invalidation.  With no usable ASID (asid == 0) or for
		 * a nested pmap (whose entries are tagged with many ASIDs), fall back
		 * to flushing the entire TLB.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-size ranges: use a single range-TLBI operation when supported. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small ranges: invalidate page by page, encoding the ASID into each operand. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10320 
10321 MARK_AS_PMAP_TEXT static void
10322 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10323 {
10324 	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10325 }
10326 
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	/* Flush kernel-pmap TLB entries for [va, va + length), then synchronize. */
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10335 
10336 unsigned int
10337 pmap_cache_attributes(
10338 	ppnum_t pn)
10339 {
10340 	pmap_paddr_t    paddr;
10341 	unsigned int    pai;
10342 	unsigned int    result;
10343 	pp_attr_t       pp_attr_current;
10344 
10345 	paddr = ptoa(pn);
10346 
10347 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10348 
10349 	if (!pa_valid(paddr)) {
10350 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10351 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10352 	}
10353 
10354 	result = VM_WIMG_DEFAULT;
10355 
10356 	pai = pa_index(paddr);
10357 
10358 	pp_attr_current = pp_attr_table[pai];
10359 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10360 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10361 	}
10362 	return result;
10363 }
10364 
/*
 * Perform cache maintenance for page pn after its WIMG attributes change from
 * wimg_bits_prev to wimg_bits_new, so that no stale dirty lines survive a
 * cacheability downgrade.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * NOTE(review): the innermost clause
	 * '(wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK)'
	 * is a tautology (a value cannot equal both), so any transition away from
	 * VM_WIMG_WTHRU triggers a sync.  '&&' was likely intended; the current
	 * form merely over-syncs, which is safe — confirm intent before changing.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Entering real-time (RT) memory additionally requires a forced dcache clean. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10381 
/*
 * Switch the cache attributes of a managed page on behalf of the compressor
 * map/unmap path, then perform any cache maintenance the transition requires.
 * Note this updates the live mappings via pmap_update_cache_attributes_locked()
 * but does not rewrite the page's WIMG bits in pp_attr_table.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock serializes cache-attribute updates for this page. */
	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance for the prev -> new transition happens outside the lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10406 
/*
 * Return a kernel virtual (physical-aperture) address through which the
 * compressor can access page pn.  On targets where the physical aperture is
 * PTE-mapped, a page with non-default cache attributes is temporarily switched
 * to VM_WIMG_DEFAULT so the aperture access is cacheable;
 * pmap_unmap_compressor_page() performs the inverse switch.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10422 
/*
 * Undo pmap_map_compressor_page().  The map path only changed the live
 * mappings, not the WIMG bits in pp_attr_table, so pmap_cache_attributes()
 * still reports the page's original attributes; if those are non-default,
 * switch the mappings back from VM_WIMG_DEFAULT to them.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10437 
10438 /**
10439  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10440  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10441  *
10442  * @param user_page_list List of pages to be updated.
10443  * @param page_cnt Number of pages in total in user_page_list.
10444  * @param cacheattr The new cache attribute.
10445  *
10446  * @return Success if true is returned.
10447  */
10448 bool
10449 pmap_batch_set_cache_attributes(
10450 	upl_page_info_array_t user_page_list,
10451 	unsigned int page_cnt,
10452 	unsigned int cacheattr)
10453 {
10454 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10455 
10456 	if (page_cnt == 0) {
10457 		return true;
10458 	}
10459 
10460 	batch_set_cache_attr_state_t states;
10461 	states.page_index = 0;
10462 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10463 	states.tlb_flush_pass_needed = false;
10464 	states.rt_cache_flush_pass_needed = false;
10465 
10466 	/* Verify we are being called from a preemptible context. */
10467 	pmap_verify_preemptible();
10468 
10469 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10470 #if XNU_MONITOR
10471 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10472 #else /* !XNU_MONITOR */
10473 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10474 #endif /* XNU_MONITOR */
10475 	}
10476 
10477 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10478 	return true;
10479 }
10480 
10481 /**
10482  * Flushes TLB entries associated with the page specified by paddr, but do not
10483  * issue barriers yet.
10484  *
10485  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10486  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* The caller must hold the PVH lock; the PV list may not change under us. */
	pvh_assert_locked(pai);

	/* The PV head is either a single inline PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page and issue an async TLB flush for its VA. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot within this PV entry; advance to the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU TLB entries; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Each PV entry holds up to PTE_PER_PVE slots; move to the next entry when exhausted. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10540 
10541 /**
10542  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10543  *
10544  * @param pai The Physical Address Index of the entry.
10545  * @param cacheattr The new cache attribute.
10546  */
10547 MARK_AS_PMAP_TEXT static void
10548 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10549 {
10550 	pvh_assert_locked(pai);
10551 
10552 	pp_attr_t pp_attr_current, pp_attr_template;
10553 	do {
10554 		pp_attr_current = pp_attr_table[pai];
10555 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10556 
10557 		/**
10558 		 * WIMG bits should only be updated under the PVH lock, but we should do
10559 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10560 		 */
10561 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10562 }
10563 
10564 /**
10565  * Batch updates the cache attributes of a list of pages in three passes.
10566  *
10567  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10568  * In pass two, TLB entries are flushed for each page in the list if necessary.
10569  * In pass three, caches are cleaned for each page in the list if necessary.
10570  *
10571  * When running in PPL, this function may decide to return to the caller in response
10572  * to AST_URGENT.
10573  *
10574  * @param user_page_list List of pages to be updated.
10575  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10576  * @param page_cnt Number of pages in total in user_page_list.
10577  * @param cacheattr The new cache attributes.
10578  *
10579  * @return The new state of the state machine.
10580  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable state machine passed in from the caller. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* Absent WIMG bits in pp_attr_table mean VM_WIMG_DEFAULT. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Pages newly switched to RT will need a cache clean in pass 3. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* The PVH lock keeps the PV list stable while the page's mappings are walked. */
			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* On preemption, disable DC ops before leaving the PPL. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the state machine so the caller can re-enter where we left off. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10817 
/*
 * Set the cache attributes of a single managed page: update pp_attr_table,
 * rewrite the page's mappings if the effective WIMG bits changed, and perform
 * any required cache maintenance.  'external' distinguishes kernel-initiated
 * calls from PPL-internal ones for the ownership sanity checks below.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may only touch non-PPL pages, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Absent WIMG bits in pp_attr_table mean VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the page's mappings if the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance for the prev -> new transition happens outside the lock. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10880 
/**
 * Implementation entry point for pmap_set_cache_attributes(); marks the
 * request as coming from an external (non-PPL) caller.
 *
 * @param pn page number of the physical page to update.
 * @param cacheattr the requested VM_WIMG_* cache attribute.
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10888 
/**
 * Set the cache attributes (VM_WIMG_*) of a physical page, updating any
 * existing mappings of the page as needed.  On XNU_MONITOR configurations
 * the work is performed inside the PPL; otherwise the internal
 * implementation is called directly.
 *
 * @param pn page number of the physical page to update.
 * @param cacheattr the requested VM_WIMG_* cache attribute.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10900 
10901 /**
10902  * Updates the page numbered ppnum to have attribute specified by attributes.
10903  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10904  * The necessity of the TLB flush is returned in case this function is called
10905  * in a batched manner and the TLB flush is intended to be done at a different
10906  * timing.
10907  *
10908  * @param ppnum Page Number of the page to be updated.
10909  * @param attributes The new cache attributes.
10910  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10911  *        immediately.
10912  *
10913  * @return Returns true if a TLB flush is needed for this update regardless of
10914  *         whether a flush has occurred already.
10915  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally disallow device-type memory attributes on managed (DRAM) pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First update the page's mapping in the kernel's physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	/* Replace only the memory-attribute and shareability fields. */
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the existing XPRR permission bits on PPL configurations. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Caller will flush later; a cheaper PTE write suffices. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* Decode the PV head: either a single PTE pointer or a PVE list. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every recorded mapping of the page and rewrite its attribute bits. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PVE once all of its PTE slots are consumed. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11057 
11058 /**
11059  * Mark a pmap as being dedicated to use for a commpage mapping.
11060  * The pmap itself will never be activated on a CPU; its mappings will
11061  * only be embedded in userspace pmaps at a fixed virtual address.
11062  *
11063  * @param pmap the pmap to mark as belonging to a commpage.
11064  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be established before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11084 
11085 static void
11086 pmap_update_tt3e(
11087 	pmap_t pmap,
11088 	vm_address_t address,
11089 	tt_entry_t template)
11090 {
11091 	tt_entry_t *ptep, pte;
11092 
11093 	ptep = pmap_tt3e(pmap, address);
11094 	if (ptep == NULL) {
11095 		panic("%s: no ptep?", __FUNCTION__);
11096 	}
11097 
11098 	pte = *ptep;
11099 	pte = tte_to_pa(pte) | template;
11100 	write_pte_strong(ptep, pte);
11101 }
11102 
11103 /* Note absence of non-global bit */
11104 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11105 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11106 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
11107 	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11108 
11109 /* Note absence of non-global bit and no-execute bit.  */
11110 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11111 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11112 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
11113 	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11114 
/**
 * Allocate the physical pages backing the commpage (data, kernel read-only
 * data, and optionally the PFZ text page) and build the dedicated commpage
 * pmap(s) that map them at the fixed user VAs.  The resulting translation
 * tables are later nested into user pmaps by pmap_insert_commpage().
 *
 * @param kernel_data_addr [out] KVA of the commpage data page.
 * @param kernel_text_addr [out] KVA of the commpage text page (0 if none).
 * @param kernel_ro_data_addr [out] KVA of the kernel RO data page.
 * @param user_text_addr [out] user VA chosen for the commpage text page
 *        (0 when CONFIG_ARM_PFZ is disabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, commpage backing pages come from the PPL page allocator. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the PTE with the commpage template (global, RO, NX). */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index to slide the commpage text within its L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11265 
11266 
11267 /*
11268  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11269  * with user controlled TTEs for regions that aren't explicitly reserved by the
11270  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11271  */
11272 #if (ARM_PGSHIFT == 14)
11273 /**
11274  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11275  * commpage completely above the maximum 32-bit userspace VA.
11276  */
11277 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11278 
11279 /**
11280  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11281  * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
11283  * commpage lives within the userspace VA range, but is protected by the VM as
11284  * a reserved region (see vm_reserved_regions[] definition for more info).
11285  */
11286 
11287 #elif (ARM_PGSHIFT == 12)
11288 /**
11289  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11290  * above the maximum userspace VA.
11291  */
11292 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11293 #else
11294 #error Nested shared page mapping is unsupported on this config
11295 #endif
11296 
11297 MARK_AS_PMAP_TEXT kern_return_t
11298 pmap_insert_commpage_internal(
11299 	pmap_t pmap)
11300 {
11301 	kern_return_t kr = KERN_SUCCESS;
11302 	vm_offset_t commpage_vaddr;
11303 	pt_entry_t *ttep, *src_ttep;
11304 	int options = 0;
11305 	pmap_t commpage_pmap = commpage_pmap_default;
11306 
11307 	/* Validate the pmap input before accessing its data. */
11308 	validate_pmap_mutable(pmap);
11309 
11310 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11311 	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
11312 
11313 #if __ARM_MIXED_PAGE_SIZE__
11314 #if !__ARM_16K_PG__
11315 	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
11316 	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11317 #endif /* !__ARM_16K_PG__ */
11318 
11319 	/* Choose the correct shared page pmap to use. */
11320 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11321 	if (pmap_page_size == 16384) {
11322 		commpage_pmap = commpage_pmap_default;
11323 	} else if (pmap_page_size == 4096) {
11324 		commpage_pmap = commpage_pmap_4k;
11325 	} else {
11326 		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11327 	}
11328 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11329 
11330 #if XNU_MONITOR
11331 	options |= PMAP_OPTIONS_NOWAIT;
11332 #endif /* XNU_MONITOR */
11333 
11334 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11335 #error We assume a single page.
11336 #endif
11337 
11338 	if (pmap_is_64bit(pmap)) {
11339 		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11340 	} else {
11341 		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11342 	}
11343 
11344 
11345 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11346 
11347 	/*
11348 	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
11349 	 * two (2MB) depending on the address space layout. For 16KB pages, each level
11350 	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
11351 	 * to "nest".
11352 	 *
11353 	 * Note: This is not "nesting" in the shared cache sense. This definition of
11354 	 * nesting just means inserting pointers to pre-allocated tables inside of
11355 	 * the passed in pmap to allow us to share page tables (which map the shared
11356 	 * page) for every task. This saves at least one page of memory per process
11357 	 * compared to creating new page tables in every process for mapping the
11358 	 * shared page.
11359 	 */
11360 
11361 	/**
11362 	 * Allocate the twig page tables if needed, and slam a pointer to the shared
11363 	 * page's tables into place.
11364 	 */
11365 	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
11366 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11367 
11368 		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);
11369 
11370 		if (kr != KERN_SUCCESS) {
11371 #if XNU_MONITOR
11372 			if (kr == KERN_RESOURCE_SHORTAGE) {
11373 				return kr;
11374 			} else
11375 #endif
11376 			if (kr == KERN_ABORTED) {
11377 				return kr;
11378 			} else {
11379 				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
11380 			}
11381 		}
11382 
11383 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11384 	}
11385 
11386 	if (*ttep != ARM_PTE_EMPTY) {
11387 		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
11388 	}
11389 
11390 	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);
11391 
11392 	*ttep = *src_ttep;
11393 	FLUSH_PTE_STRONG();
11394 
11395 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11396 
11397 	return kr;
11398 }
11399 
/**
 * Remove the nested commpage mapping from a user pmap by clearing the twig
 * entry that points at the shared commpage page tables, then flushing the
 * TLB for the commpage VA.
 *
 * @param pmap the user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* No twig table at this level means there is nothing to unmap. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Flush the commpage VA from this pmap's TLB entries and synchronize. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11456 
/**
 * Insert the commpage mapping into a user pmap, retrying until it succeeds.
 * On XNU_MONITOR configurations, a KERN_RESOURCE_SHORTAGE from the PPL is
 * handled by donating a fresh page to the PPL and retrying; KERN_ABORTED is
 * always retried.  Any other failure is fatal.
 *
 * @param pmap the user pmap that should receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL ran out of free pages; give it one and try again. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11485 
/**
 * Report whether the given pmap describes a 64-bit address space.
 *
 * @param pmap the pmap to query.
 *
 * @return the pmap's is_64bit flag.
 */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11492 
/**
 * Report whether the given pmap is "exotic" (an unusual address-space
 * configuration).  On this architecture configuration no pmap is exotic,
 * so this always returns false.
 *
 * @param pmap unused.
 *
 * @return false, unconditionally.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11499 
11500 
11501 /* ARMTODO -- an implementation that accounts for
11502  * holes in the physical map, if any.
11503  */
11504 boolean_t
11505 pmap_valid_page(
11506 	ppnum_t pn)
11507 {
11508 	return pa_valid(ptoa(pn));
11509 }
11510 
11511 boolean_t
11512 pmap_bootloader_page(
11513 	ppnum_t pn)
11514 {
11515 	pmap_paddr_t paddr = ptoa(pn);
11516 
11517 	if (pa_valid(paddr)) {
11518 		return FALSE;
11519 	}
11520 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11521 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11522 }
11523 
/**
 * Determine whether a pmap has no valid leaf mappings in the VA range
 * [va_start, va_end) by walking the twig entries covering the range and
 * scanning their leaf tables.
 *
 * @param pmap the pmap to scan; NULL is treated as trivially empty.
 * @param va_start start of the VA range to scan.
 * @param va_end end of the VA range to scan.
 *
 * @return TRUE if no non-empty PTE was found in the range.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Latch the debugger state once so lock/unlock decisions stay paired. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Skip locking for the kernel pmap and while in the kernel debugger. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-sized (leaf-table) block at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11584 
/**
 * Determine whether a pmap has no valid mappings in [va_start, va_end).
 * On XNU_MONITOR configurations the check is performed inside the PPL.
 *
 * @param pmap the pmap to scan.
 * @param va_start start of the VA range to scan.
 * @param va_end end of the VA range to scan.
 *
 * @return TRUE if the range contains no mappings.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11597 
11598 vm_map_offset_t
11599 pmap_max_offset(
11600 	boolean_t               is64,
11601 	unsigned int    option)
11602 {
11603 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11604 }
11605 
11606 vm_map_offset_t
11607 pmap_max_64bit_offset(
11608 	__unused unsigned int option)
11609 {
11610 	vm_map_offset_t max_offset_ret = 0;
11611 
11612 #if defined(__arm64__)
11613 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11614 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11615 		max_offset_ret = arm64_pmap_max_offset_default;
11616 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11617 		max_offset_ret = min_max_offset;
11618 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11619 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11620 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11621 		if (arm64_pmap_max_offset_default) {
11622 			max_offset_ret = arm64_pmap_max_offset_default;
11623 		} else if (max_mem > 0xC0000000) {
11624 			// devices with > 3GB of memory
11625 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11626 		} else if (max_mem > 0x40000000) {
11627 			// devices with > 1GB and <= 3GB of memory
11628 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11629 		} else {
11630 			// devices with <= 1 GB of memory
11631 			max_offset_ret = min_max_offset;
11632 		}
11633 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11634 		if (arm64_pmap_max_offset_default) {
11635 			// Allow the boot-arg to override jumbo size
11636 			max_offset_ret = arm64_pmap_max_offset_default;
11637 		} else {
11638 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11639 		}
11640 	} else {
11641 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11642 	}
11643 
11644 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11645 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11646 		assert(max_offset_ret >= min_max_offset);
11647 	}
11648 #else
11649 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11650 #endif
11651 
11652 	return max_offset_ret;
11653 }
11654 
11655 vm_map_offset_t
11656 pmap_max_32bit_offset(
11657 	unsigned int option)
11658 {
11659 	vm_map_offset_t max_offset_ret = 0;
11660 
11661 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11662 		max_offset_ret = arm_pmap_max_offset_default;
11663 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11664 		max_offset_ret = VM_MAX_ADDRESS;
11665 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11666 		max_offset_ret = VM_MAX_ADDRESS;
11667 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11668 		if (arm_pmap_max_offset_default) {
11669 			max_offset_ret = arm_pmap_max_offset_default;
11670 		} else if (max_mem > 0x20000000) {
11671 			max_offset_ret = VM_MAX_ADDRESS;
11672 		} else {
11673 			max_offset_ret = VM_MAX_ADDRESS;
11674 		}
11675 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11676 		max_offset_ret = VM_MAX_ADDRESS;
11677 	} else {
11678 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11679 	}
11680 
11681 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11682 	return max_offset_ret;
11683 }
11684 
11685 #if CONFIG_DTRACE
11686 /*
11687  * Constrain DTrace copyin/copyout actions
11688  */
11689 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11690 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11691 
11692 kern_return_t
11693 dtrace_copyio_preflight(
11694 	__unused addr64_t va)
11695 {
11696 	if (current_map() == kernel_map) {
11697 		return KERN_FAILURE;
11698 	} else {
11699 		return KERN_SUCCESS;
11700 	}
11701 }
11702 
/**
 * DTrace copyio postflight hook: no cleanup is required on this
 * architecture, so this always succeeds.
 *
 * @param va unused.
 *
 * @return KERN_SUCCESS, unconditionally.
 */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11709 #endif /* CONFIG_DTRACE */
11710 
11711 
/**
 * Initialize a pmap flush context.  No per-context state is tracked on this
 * architecture (see pmap_flush() below), so this is a no-op.
 *
 * @param pfc unused flush context.
 */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11716 
11717 
/**
 * Flush deferred TLB invalidations described by a flush context.
 * Currently a no-op on this architecture.
 *
 * @param cpus_to_flush unused flush context.
 */
void
pmap_flush(
	__unused pmap_flush_context *cpus_to_flush)
{
	/* not implemented yet */
	return;
}
11725 
11726 #if XNU_MONITOR
11727 
11728 /*
11729  * Enforce that the address range described by kva and nbytes is not currently
11730  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11731  * unintentionally writing to PPL-owned memory.
11732  */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	/* Reject ranges whose end wraps around the address space. */
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Pin each physical page underlying the [kva, end) range. */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* The static physical-aperture alias must never be pinned. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * CAS-set PP_ATTR_NO_MONITOR so the page cannot become PPL-owned
		 * while pinned; panic if it already belongs to the PPL.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11755 
/* Release the pin taken by pmap_pin_kernel_pages() on every page in the range. */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	/* Reject ranges whose end wraps around the address space. */
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		/* Each page must actually have been pinned by a prior pin call. */
		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page can never simultaneously be PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11773 
11774 /**
11775  * Lock down a page, making all mappings read-only, and preventing further
11776  * mappings or removal of this particular kva's mapping. Effectively, it makes
11777  * the physical page at kva immutable (see the ppl_writable parameter for an
11778  * exception to this).
11779  *
11780  * @param kva Valid address to any mapping of the physical page to lockdown.
11781  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11782  * @param ppl_writable True if the PPL should still be able to write to the page
11783  *                     using the physical aperture mapping. False will make the
11784  *                     page read-only for both the kernel and PPL in the
11785  *                     physical aperture.
11786  */
11787 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown policy: cap all existing alias mappings at read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11793 
11794 /**
11795  * Lock down a page, giving all mappings the specified maximum permissions, and
11796  * preventing further mappings or removal of this particular kva's mapping.
11797  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11798  * parameter for an exception to this).
11799  *
11800  * @param kva Valid address to any mapping of the physical page to lockdown.
11801  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11802  * @param ppl_writable True if the PPL should still be able to write to the page
11803  *                     using the physical aperture mapping. False will make the
11804  *                     page read-only for both the kernel and PPL in the
11805  *                     physical aperture.
11806  * @param prot Maximum permissions to allow in existing alias mappings
11807  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* A page already owned by the PPL cannot also be locked down. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Refuse double-lockdown and refuse to lock down executable pages. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings of this page to at most 'prot'. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11855 
11856 /**
11857  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11858  * kernel once again.
11859  *
11860  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11861  *       to unlockdown a page that was never locked down, will panic.
11862  *
11863  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11864  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11865  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11866  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11867  *                     deviation will result in a panic.
11868  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* The page must currently be locked down for this specific reason. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11889 
11890 /**
11891  * Release a page from being locked down to the PPL, making it writable to the
11892  * kernel once again.
11893  *
11894  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11895  *       to unlockdown a page that was never locked down, will panic.
11896  *
11897  * @param kva Valid address to any mapping of the physical page to unlockdown.
11898  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11899  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11900  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11901  *                     deviation will result in a panic.
11902  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PVH lock and let the _locked variant do the actual work. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11914 
11915 #else /* XNU_MONITOR */
11916 
/* Non-PPL build: there is no PPL-owned memory to guard against; no-op stub. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11921 
/* Non-PPL build: pairs with the pmap_pin_kernel_pages() stub; no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11926 
11927 #endif /* !XNU_MONITOR */
11928 
11929 
/* Lock down a range of pages on behalf of code signing. */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	/* Tag the lockdown with the code-signing reason bit. */
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	/* No PPL: no lockdown-reason flag to record. */
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11939 
/* Release a code-signing lockdown taken by pmap_cs_lockdown_pages(). */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11949 
11950 /**
11951  * Perform basic validation checks on the destination only and
11952  * corresponding offset/sizes prior to writing to a read only allocation.
11953  *
11954  * @note Should be called before writing to an allocation from the read
11955  * only allocator.
11956  *
11957  * @param zid The ID of the zone the allocation belongs to.
11958  * @param va VA of element being modified (destination).
11959  * @param offset Offset being written to, in the element.
11960  * @param new_data_size Size of modification.
11961  *
11962  */
11963 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/*
	 * Note: if offset > elem_size, (elem_size - offset) wraps (unsigned), so
	 * this comparison may pass; the offset check below still panics before
	 * the function returns, so no oversized write can slip through.
	 */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
11990 
11991 
11992 /**
11993  * Perform basic validation checks on the source, destination and
11994  * corresponding offset/sizes prior to writing to a read only allocation.
11995  *
11996  * @note Should be called before writing to an allocation from the read
11997  * only allocator.
11998  *
11999  * @param zid The ID of the zone the allocation belongs to.
12000  * @param va VA of element being modified (destination).
12001  * @param offset Offset being written to, in the element.
12002  * @param new_data Pointer to new data (source).
12003  * @param new_data_size Size of modification.
12004  *
12005  */
12006 
12007 MARK_AS_PMAP_TEXT static void
12008 pmap_ro_zone_validate_element(
12009 	zone_id_t           zid,
12010 	vm_offset_t         va,
12011 	vm_offset_t         offset,
12012 	const vm_offset_t   new_data,
12013 	vm_size_t           new_data_size)
12014 {
12015 	vm_offset_t sum = 0;
12016 
12017 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12018 		panic("%s: Integer addition overflow %p + %lu = %lu",
12019 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12020 	}
12021 
12022 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12023 }
12024 
12025 /**
12026  * Ensure that physical page is locked down and pinned, before writing to it.
12027  *
12028  * @note Should be called before writing to an allocation from the read
12029  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12030  * ensure that it is called after the modification.
12031  *
12032  *
12033  * @param pa Physical address of the element being modified.
12034  * @param va Virtual address of element being modified.
12035  * @param size Size of the modification.
12036  *
12037  */
12038 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Held across the modification; released by pmap_ro_zone_unlock_phy_page(). */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
12059 
12060 /**
12061  * Unlock and unpin physical page after writing to it.
12062  *
12063  * @note Should be called after writing to an allocation from the read
12064  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12065  * ensure that it has been called prior to the modification.
12066  *
12067  * @param pa Physical address of the element that was modified.
12068  * @param va Virtual address of element that was modified.
12069  * @param size Size of the modification.
12070  *
12071  */
12072 
12073 MARK_AS_PMAP_TEXT static void
12074 pmap_ro_zone_unlock_phy_page(
12075 	const pmap_paddr_t  pa,
12076 	vm_offset_t         va,
12077 	vm_size_t           size)
12078 {
12079 	const unsigned int pai = pa_index(pa);
12080 	pmap_unpin_kernel_pages(va, size);
12081 	pvh_unlock(pai);
12082 }
12083 
12084 /**
12085  * Function to copy kauth_cred from new_data to kv.
12086  * Function defined in "kern_prot.c"
12087  *
12088  * @note Will be removed upon completion of
12089  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12090  *
12091  * @param kv Address to copy new data to.
12092  * @param new_data Pointer to new data.
12093  *
12094  */
12095 
12096 extern void
12097 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12098 
12099 /**
12100  * Zalloc-specific memcpy that writes through the physical aperture
12101  * and ensures the element being modified is from a read-only zone.
12102  *
12103  * @note Designed to work only with the zone allocator's read-only submap.
12104  *
12105  * @param zid The ID of the zone to allocate from.
12106  * @param va VA of element to be modified.
12107  * @param offset Offset from element.
12108  * @param new_data Pointer to new data.
12109  * @param new_data_size	Size of modification.
12110  *
12111  */
12112 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12127 
MARK_AS_PMAP_TEXT void
pmap_ro_zone_memcpy_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	const vm_offset_t     new_data,
	vm_size_t             new_data_size)
{
	/* Translate before the early return: kvtophys_nofail() also vets the VA. */
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);

	/* NULL-source or zero-length copies are silently ignored. */
	if (!new_data || new_data_size == 0) {
		return;
	}

	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
	/* Write through the physical aperture rather than the read-only zone VA. */
	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
}
12147 
12148 /**
12149  * Zalloc-specific function to atomically mutate fields of an element that
12150  * belongs to a read-only zone, via the physcial aperture.
12151  *
12152  * @note Designed to work only with the zone allocator's read-only submap.
12153  *
12154  * @param zid The ID of the zone the element belongs to.
12155  * @param va VA of element to be modified.
12156  * @param offset Offset in element.
12157  * @param op Atomic operation to perform.
12158  * @param value	Mutation value.
12159  *
12160  */
12161 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12176 
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of the op encoding carries the operand size in bytes. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	/* Perform the mutation through the physical aperture mapping. */
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12195 
12196 /**
12197  * bzero for allocations from read only zones, that writes through the
12198  * physical aperture.
12199  *
12200  * @note This is called by the zfree path of all allocations from read
12201  * only zones.
12202  *
12203  * @param zid The ID of the zone the allocation belongs to.
12204  * @param va VA of element to be zeroed.
12205  * @param offset Offset in the element.
12206  * @param size	Size of allocation.
12207  *
12208  */
12209 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12223 
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* new_data of 0 is acceptable here: only the destination checks apply. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	/* Zero through the physical aperture rather than the read-only zone VA. */
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12237 
12238 /**
12239  * Removes write access from the Physical Aperture.
12240  *
12241  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12242  * @note Designed to work only with the zone allocator's read-only submap.
12243  *
12244  * @param va VA of the page to restore write access to.
12245  *
12246  */
12247 MARK_AS_PMAP_TEXT static void
12248 pmap_phys_write_disable(vm_address_t va)
12249 {
12250 #if XNU_MONITOR
12251 	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
12252 #else /* XNU_MONITOR */
12253 	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
12254 #endif /* XNU_MONITOR */
12255 }
12256 
12257 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12258 
/*
 * Count resident and compressed bytes in [start, end), which must be
 * page-aligned and fall within a single twig (L2) table entry.  Compressed
 * bytes are accumulated (+=) into *compressed_bytes_p when it is non-NULL.
 * Returns PMAP_RESIDENT_INVALID if the pmap is NULL or has no translation
 * table entry covering 'start'.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Walk the leaf PTEs covering [start, end) under the shared lock. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin: the output buffer must not become PPL-owned mid-write. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12320 
12321 mach_vm_size_t
12322 pmap_query_resident(
12323 	pmap_t                  pmap,
12324 	vm_map_address_t        start,
12325 	vm_map_address_t        end,
12326 	mach_vm_size_t          *compressed_bytes_p)
12327 {
12328 	mach_vm_size_t          total_resident_bytes;
12329 	mach_vm_size_t          compressed_bytes;
12330 	vm_map_address_t        va;
12331 
12332 
12333 	if (pmap == PMAP_NULL) {
12334 		if (compressed_bytes_p) {
12335 			*compressed_bytes_p = 0;
12336 		}
12337 		return 0;
12338 	}
12339 
12340 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12341 
12342 	total_resident_bytes = 0;
12343 	compressed_bytes = 0;
12344 
12345 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12346 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12347 	    VM_KERNEL_ADDRHIDE(end));
12348 
12349 	va = start;
12350 	while (va < end) {
12351 		vm_map_address_t l;
12352 		mach_vm_size_t resident_bytes;
12353 
12354 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12355 
12356 		if (l > end) {
12357 			l = end;
12358 		}
12359 #if XNU_MONITOR
12360 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12361 #else
12362 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12363 #endif
12364 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12365 			break;
12366 		}
12367 
12368 		total_resident_bytes += resident_bytes;
12369 
12370 		va = l;
12371 	}
12372 
12373 	if (compressed_bytes_p) {
12374 		*compressed_bytes_p = compressed_bytes;
12375 	}
12376 
12377 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12378 	    total_resident_bytes);
12379 
12380 	return total_resident_bytes;
12381 }
12382 
12383 #if MACH_ASSERT
/*
 * Verify this pmap's ledger entries via vm_map_pmap_check_ledgers(), but only
 * when the pmap is still fully associated with a live task.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int     pid;
	char    *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()).  Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
12412 #endif /* MACH_ASSERT */
12413 
/* No pagezero-range handling is performed on this architecture; no-op. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12418 
12419 /**
12420  * The minimum shared region nesting size is used by the VM to determine when to
12421  * break up large mappings to nested regions. The smallest size that these
12422  * mappings can be broken into is determined by what page table level those
12423  * regions are being nested in at and the size of the page tables.
12424  *
12425  * For instance, if a nested region is nesting at L2 for a process utilizing
12426  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12427  * block entry).
12428  *
12429  * @param pmap The target pmap to determine the block size based on whether it's
12430  *             using 16KB or 4KB page tables.
12431  */
12432 uint64_t
12433 pmap_shared_region_size_min(__unused pmap_t pmap)
12434 {
12435 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12436 
12437 	/**
12438 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12439 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12440 	 * point to shared L3 page tables in the shared region pmap.
12441 	 */
12442 	return pt_attr_twig_size(pt_attr);
12443 }
12444 
12445 boolean_t
12446 pmap_enforces_execute_only(
12447 	pmap_t pmap)
12448 {
12449 	return pmap != kernel_pmap;
12450 }
12451 
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	/* Record whether the VM map owning this pmap has CS enforcement enabled. */
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12460 
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12472 
12473 extern int cs_process_enforcement_enable;
12474 bool
12475 pmap_get_vm_map_cs_enforced(
12476 	pmap_t pmap)
12477 {
12478 	if (cs_process_enforcement_enable) {
12479 		return true;
12480 	}
12481 	return pmap->pmap_vm_map_cs_enforced;
12482 }
12483 
12484 MARK_AS_PMAP_TEXT void
12485 pmap_set_jit_entitled_internal(
12486 	__unused pmap_t pmap)
12487 {
12488 	return;
12489 }
12490 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12501 
/* JIT entitlement state is not tracked in this pmap layer; always false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12508 
12509 MARK_AS_PMAP_TEXT void
12510 pmap_set_tpro_internal(
12511 	__unused pmap_t pmap)
12512 {
12513 	return;
12514 }
12515 
void
pmap_set_tpro(
	pmap_t pmap)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12526 
/* TPRO state is not tracked in this pmap layer; always false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12533 
12534 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12535 
/*
 * Compute the disposition (PMAP_QUERY_PAGE_* flags) of the page mapped at
 * 'va' in 'pmap' and store it through 'disp_p'.  Returns
 * KERN_INVALID_ARGUMENT (with *disp_p = 0) for a NULL or kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		/* Pin: the output buffer must not become PPL-owned mid-write. */
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical page: check for a compressed-marker PTE. */
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		/* The PTE was read before taking the PVH lock; re-validate it. */
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry (if any) that corresponds to this PTE. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin: the output buffer must not become PPL-owned mid-write. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12614 
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	/* Route through the PPL when one is present, else call the implementation directly. */
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12627 
12628 
12629 
/* Number of valid user VA bits for this pmap. */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* Derive the width from the pmap's TCR T0SZ field. */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	/* Single page-size configuration: fixed at boot. */
	return 64 - T0SZ_BOOT;
#endif
}
12640 
/* Number of valid kernel VA bits; fixed at boot by T1SZ. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12646 
12647 static vm_map_size_t
12648 pmap_user_va_size(pmap_t pmap)
12649 {
12650 	return 1ULL << pmap_user_va_bits(pmap);
12651 }
12652 
12653 
12654 
bool
pmap_in_ppl(void)
{
	/* No PPL exists on this configuration, so we can never be inside it. */
	return false;
}
12661 
/* Protected I/O writes are unsupported on this platform; always panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12668 
void *
pmap_claim_reserved_ppl_page(void)
{
	/* No reserved PPL pages exist on this configuration. */
	return NULL;
}
12675 
12676 void
12677 pmap_free_reserved_ppl_page(void __unused *kva)
12678 {
12679 	// Unsupported
12680 }
12681 
12682 
12683 #if PMAP_CS_PPL_MONITOR
12684 
12685 /* Immutable part of the trust cache runtime */
12686 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12687 
12688 /* Mutable part of the trust cache runtime */
12689 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12690 
12691 /* Lock for the trust cache runtime */
12692 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12693 
12694 MARK_AS_PMAP_TEXT kern_return_t
12695 pmap_check_trust_cache_runtime_for_uuid_internal(
12696 	const uint8_t check_uuid[kUUIDSize])
12697 {
12698 	kern_return_t ret = KERN_DENIED;
12699 
12700 	if (amfi->TrustCache.version < 3) {
12701 		/* AMFI change hasn't landed in the build */
12702 		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12703 		return KERN_NOT_SUPPORTED;
12704 	}
12705 
12706 	/* Lock the runtime as shared */
12707 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12708 
12709 	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12710 		&ppl_trust_cache_rt,
12711 		check_uuid,
12712 		NULL);
12713 
12714 	/* Unlock the runtime */
12715 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12716 
12717 	if (tc_ret.error == kTCReturnSuccess) {
12718 		ret = KERN_SUCCESS;
12719 	} else if (tc_ret.error == kTCReturnNotFound) {
12720 		ret = KERN_NOT_FOUND;
12721 	} else {
12722 		ret = KERN_FAILURE;
12723 		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12724 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12725 	}
12726 
12727 	return ret;
12728 }
12729 
/*
 * Kernel-facing wrapper: trap into the PPL to check whether a trust cache
 * with the given UUID has been loaded into the PPL runtime.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12736 
/*
 * Load a trust cache of the given type into the PPL trust cache runtime.
 *
 * The img4 payload and manifest buffers are provided by the kernel. Both are
 * locked down (removed from kernel control) before validation so their
 * contents cannot be changed mid-flight. On success the payload remains
 * locked down, as the runtime keeps referencing it; on failure it is
 * unlocked and returned to the kernel. The manifest is always unlocked
 * before returning.
 *
 * Returns KERN_SUCCESS on load, KERN_ALREADY_IN_SET for duplicates,
 * KERN_RESOURCE_SHORTAGE when the PPL needs a page donated (caller should
 * retry after pmap_alloc_page_for_ppl), or KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types which may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		/* KERN_RESOURCE_SHORTAGE tells the caller to donate a page and retry. */
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/*
		 * On any failure the runtime does not keep a reference to the payload,
		 * so hand the pages back to the kernel.
		 */
		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12856 
12857 kern_return_t
12858 pmap_load_trust_cache_with_type(
12859 	TCType_t type,
12860 	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12861 	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12862 	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12863 {
12864 	kern_return_t ret = KERN_DENIED;
12865 
12866 	ret = pmap_load_trust_cache_with_type_ppl(
12867 		type,
12868 		pmap_img4_payload, pmap_img4_payload_len,
12869 		img4_manifest, img4_manifest_len,
12870 		img4_aux_manifest, img4_aux_manifest_len);
12871 
12872 	while (ret == KERN_RESOURCE_SHORTAGE) {
12873 		/* Allocate a page from the free list */
12874 		pmap_alloc_page_for_ppl(0);
12875 
12876 		/* Attempt the call again */
12877 		ret = pmap_load_trust_cache_with_type_ppl(
12878 			type,
12879 			pmap_img4_payload, pmap_img4_payload_len,
12880 			img4_manifest, img4_manifest_len,
12881 			img4_aux_manifest, img4_aux_manifest_len);
12882 	}
12883 
12884 	return ret;
12885 }
12886 
12887 MARK_AS_PMAP_TEXT kern_return_t
12888 pmap_query_trust_cache_safe(
12889 	TCQueryType_t query_type,
12890 	const uint8_t cdhash[kTCEntryHashSize],
12891 	TrustCacheQueryToken_t *query_token)
12892 {
12893 	kern_return_t ret = KERN_NOT_FOUND;
12894 
12895 	/* Validate the query type preemptively */
12896 	if (query_type >= kTCQueryTypeTotal) {
12897 		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12898 		return KERN_INVALID_ARGUMENT;
12899 	}
12900 
12901 	/* Lock the runtime as shared */
12902 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12903 
12904 	TCReturn_t tc_ret = amfi->TrustCache.query(
12905 		&ppl_trust_cache_rt,
12906 		query_type,
12907 		cdhash,
12908 		query_token);
12909 
12910 	/* Unlock the runtime */
12911 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12912 
12913 	if (tc_ret.error == kTCReturnSuccess) {
12914 		ret = KERN_SUCCESS;
12915 	} else if (tc_ret.error == kTCReturnNotFound) {
12916 		ret = KERN_NOT_FOUND;
12917 	} else {
12918 		ret = KERN_FAILURE;
12919 		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12920 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12921 	}
12922 
12923 	return ret;
12924 }
12925 
/*
 * PPL entry point for trust cache queries. Copies the kernel-provided CDHash
 * into PPL storage before the query, then copies the query token back out to
 * kernel memory, pinning the destination pages around the write so they stay
 * resident and writable for the copy-out.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	/* Copy the result out only if the caller asked for a token. */
	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
12950 
12951 kern_return_t
12952 pmap_query_trust_cache(
12953 	TCQueryType_t query_type,
12954 	const uint8_t cdhash[kTCEntryHashSize],
12955 	TrustCacheQueryToken_t *query_token)
12956 {
12957 	kern_return_t ret = KERN_NOT_FOUND;
12958 
12959 	ret = pmap_query_trust_cache_ppl(
12960 		query_type,
12961 		cdhash,
12962 		query_token);
12963 
12964 	return ret;
12965 }
12966 
/* True once the developer mode state below has been explicitly set at least once. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
/* Current developer mode state; only meaningful once ppl_developer_mode_set is true. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12969 
/*
 * Update the PPL's view of the system developer mode state.
 *
 * Enforces a one-way latch: once developer mode has been observed as false
 * after being set, it may never transition back to true. Violations panic.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
12998 
/*
 * Kernel-facing wrapper: trap into the PPL to toggle the developer mode
 * state. Transition rules are enforced inside the PPL.
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13005 
13006 #endif /* PMAP_CS_PPL_MONITOR */
13007 
13008 #if PMAP_CS_INCLUDE_CODE_SIGNING
13009 
/*
 * Comparator for the registered-profiles red-black tree. Profiles are keyed
 * purely by their address; any consistent total order suffices.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13022 
13023 /* Red-black tree for managing provisioning profiles */
13024 MARK_AS_PMAP_DATA static
13025 RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;
13026 
13027 RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13028 RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13029 
13030 /* Lock for the profile red-black tree */
13031 MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13032 
/*
 * One-time initialization of the PPL provisioning-profile bookkeeping:
 * the red-black tree of registered profiles and its rw-lock.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* This lock is taken on PPL paths, so it must never sleep. */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13045 
13046 static bool
13047 pmap_is_testflight_profile(
13048 	pmap_cs_profile_t *profile_obj)
13049 {
13050 	const char *entitlement_name = "beta-reports-active";
13051 	const size_t entitlement_length = strlen(entitlement_name);
13052 	CEQueryOperation_t query[2] = {0};
13053 
13054 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13055 	if (profile_obj->entitlements_ctx == NULL) {
13056 		return false;
13057 	}
13058 
13059 	/* Build our CoreEntitlements query */
13060 	query[0].opcode = kCEOpSelectKey;
13061 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13062 	query[0].parameters.stringParameter.length = entitlement_length;
13063 	query[1] = CEMatchBool(true);
13064 
13065 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13066 		profile_obj->entitlements_ctx,
13067 		query, 2);
13068 
13069 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13070 		return true;
13071 	}
13072 
13073 	return false;
13074 }
13075 
13076 static bool
13077 pmap_is_development_profile(
13078 	pmap_cs_profile_t *profile_obj)
13079 {
13080 	/* Check for UPP */
13081 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13082 		*profile_obj->profile_ctx,
13083 		CESelectDictValue("ProvisionsAllDevices"));
13084 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13085 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13086 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13087 			return false;
13088 		}
13089 	}
13090 
13091 	/* Check for TestFlight profile */
13092 	if (pmap_is_testflight_profile(profile_obj) == true) {
13093 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13094 		return false;
13095 	}
13096 
13097 	pmap_cs_log_info("%p: development profile", profile_obj);
13098 	return true;
13099 }
13100 
/*
 * Extract, validate, and cache the "Entitlements" dictionary of a registered
 * provisioning profile into the profile object's query context.
 *
 * Returns KERN_SUCCESS when the context is set up, KERN_NOT_FOUND when the
 * profile provisions no entitlements (entitlements_ctx is left NULL), and
 * KERN_ABORTED when CoreEntitlements validation or context acquisition fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements: clear the storage so stale data cannot be queried. */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13151 
/*
 * Validate and register a provisioning profile with the PPL.
 *
 * The kernel-supplied payload is locked down before use so its contents
 * cannot change during validation; it stays locked down for the lifetime of
 * the registration (see pmap_unregister_provisioning_profile_internal for
 * the unlock). The profile is validated through CoreTrust, parsed with
 * CoreEntitlements, and inserted into the registered-profiles red-black
 * tree. Most validation failures panic, since they indicate either kernel
 * misuse or tampering.
 *
 * Returns KERN_SUCCESS on registration, or KERN_RESOURCE_SHORTAGE when the
 * PPL needs a page donated (the caller retries).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Verify the profile's signature chain through CoreTrust. */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Wrap the validated profile content in a CoreEntitlements DER context. */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13255 
13256 kern_return_t
13257 pmap_register_provisioning_profile(
13258 	const vm_address_t payload_addr,
13259 	const vm_size_t payload_size)
13260 {
13261 	kern_return_t ret = KERN_DENIED;
13262 
13263 	ret = pmap_register_provisioning_profile_ppl(
13264 		payload_addr,
13265 		payload_size);
13266 
13267 	while (ret == KERN_RESOURCE_SHORTAGE) {
13268 		/* Allocate a page from the free list */
13269 		pmap_alloc_page_for_ppl(0);
13270 
13271 		/* Attempt the call again */
13272 		ret = pmap_register_provisioning_profile_ppl(
13273 			payload_addr,
13274 			payload_size);
13275 	}
13276 
13277 	return ret;
13278 }
13279 
/*
 * Unregister a provisioning profile from the PPL.
 *
 * The profile must exist in the registered tree (otherwise panic) and must
 * have no outstanding signature associations (reference count zero),
 * otherwise KERN_FAILURE is returned. On success the original payload pages
 * are unlocked and handed back to the kernel.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A non-zero count means some signature still references this profile. */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile object lives inside the now-released payload; drop the pointer. */
		profile_obj = NULL;
	}
	return ret;
}
13327 
/*
 * Kernel-facing wrapper: trap into the PPL to unregister a provisioning
 * profile.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13334 
/*
 * Associate a registered provisioning profile with a code signature.
 *
 * The signature must still be untrusted (association influences the
 * subsequent trust evaluation) and must not already have a profile.
 * Takes the code directory lock for the duration; the exit path releases
 * it exclusively, matching pmap_cs_lock_code_directory.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13383 
/*
 * Kernel-facing wrapper: trap into the PPL to associate a provisioning
 * profile with a code signature.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13391 
/*
 * Disassociate a provisioning profile from a code signature, dropping the
 * profile's reference count. Returns KERN_NOT_FOUND when the signature has
 * no associated profile. The reference count is released only after the
 * code directory lock has been dropped.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	/* Keep a local reference so the count can be dropped after unlocking. */
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13427 
/*
 * Kernel-facing wrapper: trap into the PPL to disassociate a provisioning
 * profile from a code signature.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13434 
13435 kern_return_t
13436 pmap_associate_kernel_entitlements_internal(
13437 	pmap_cs_code_directory_t *cd_entry,
13438 	const void *kernel_entitlements)
13439 {
13440 	kern_return_t ret = KERN_DENIED;
13441 
13442 	if (kernel_entitlements == NULL) {
13443 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13444 	}
13445 
13446 	/* Acquire the lock on the code directory */
13447 	pmap_cs_lock_code_directory(cd_entry);
13448 
13449 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13450 		ret = KERN_DENIED;
13451 		goto out;
13452 	} else if (cd_entry->kernel_entitlements != NULL) {
13453 		ret = KERN_DENIED;
13454 		goto out;
13455 	}
13456 	cd_entry->kernel_entitlements = kernel_entitlements;
13457 
13458 	/* Association was a success */
13459 	ret = KERN_SUCCESS;
13460 
13461 out:
13462 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13463 	return ret;
13464 }
13465 
/*
 * Kernel-facing wrapper: trap into the PPL to attach a kernel entitlements
 * object to a code signature.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13473 
/*
 * Look up the kernel entitlements object associated with a pmap's main
 * code-signing region and, if found, copy the pointer out to the caller.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap or when there is no signature
 * or entitlements object, and KERN_ABORTED when the pmap lock could not be
 * taken without blocking (the kernel-side wrapper retries on this).
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13536 
13537 kern_return_t
13538 pmap_resolve_kernel_entitlements(
13539 	pmap_t pmap,
13540 	const void **kernel_entitlements)
13541 {
13542 	kern_return_t ret = KERN_DENIED;
13543 
13544 	do {
13545 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13546 	} while (ret == KERN_ABORTED);
13547 
13548 	return ret;
13549 }
13550 
/*
 * Build a CoreEntitlements acceleration index for the entitlements context
 * attached to a code directory.
 *
 * The index buffer is placed, in order of preference, in:
 *   1. spare space at the end of the locked-down code signature region,
 *   2. a PMAP_CS blob-allocator bucket,
 *   3. a freshly allocated page from the PPL page free list.
 *
 * Returns KERN_SUCCESS when the context is accelerated (or needs no
 * acceleration), KERN_DENIED for signatures that were not reconstituted,
 * KERN_ABORTED when the required buffer cannot fit within a single page, or
 * an allocator error. Panics on CoreEntitlements failures that should be
 * impossible for an already-validated signature.
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements context, or already accelerated: nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Buffer lives inside the signature region; nothing separate to free */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		/* Mark the buffer as separately allocated so it can be freed later */
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13697 
13698 kern_return_t
13699 pmap_accelerate_entitlements(
13700 	pmap_cs_code_directory_t *cd_entry)
13701 {
13702 	kern_return_t ret = KERN_DENIED;
13703 
13704 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
13705 	while (ret == KERN_RESOURCE_SHORTAGE) {
13706 		/* Allocate a page for the PPL */
13707 		pmap_alloc_page_for_ppl(0);
13708 
13709 		/* Try again */
13710 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
13711 	}
13712 
13713 	return ret;
13714 }
13715 
13716 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13717 
13718 MARK_AS_PMAP_TEXT bool
13719 pmap_lookup_in_loaded_trust_caches_internal(
13720 	const uint8_t cdhash[CS_CDHASH_LEN])
13721 {
13722 	kern_return_t kr = KERN_NOT_FOUND;
13723 
13724 #if PMAP_CS_PPL_MONITOR
13725 	/*
13726 	 * If we have the PPL monitor, then this function can only be called from
13727 	 * within the PPL. Calling it directly would've caused a panic, so we can
13728 	 * assume that we're in the PPL here.
13729 	 */
13730 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13731 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13732 
13733 	kr = pmap_query_trust_cache_safe(
13734 		kTCQueryTypeLoadable,
13735 		cdhash_safe,
13736 		NULL);
13737 #else
13738 	kr = query_trust_cache(
13739 		kTCQueryTypeLoadable,
13740 		cdhash,
13741 		NULL);
13742 #endif
13743 
13744 	if (kr == KERN_SUCCESS) {
13745 		return true;
13746 	}
13747 	return false;
13748 }
13749 
/*
 * Kernel-side entry point: check whether a CDHash is present in any loaded
 * trust cache, trapping into the PPL when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13760 
/*
 * Look up a CDHash in the static (engraved) trust cache.
 *
 * Returns 0 when the hash is not found. On a hit, returns a packed uint32_t
 * with TC_LOOKUP_FOUND in the result field plus the entry's hash type and
 * flags shifted into their respective fields.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the entry's flags and hash type from the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		/* Only the low 8 bits of the 64-bit flags fit into the packed result */
		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
13801 
/*
 * Kernel-side entry point: look up a CDHash in the static trust cache,
 * trapping into the PPL when the monitor is enabled.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13811 
13812 #if PMAP_CS_INCLUDE_CODE_SIGNING
13813 
/* Protects pmap_compilation_service_cdhash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash registered for the compilation service; set and matched through the PPL */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13816 
13817 MARK_AS_PMAP_TEXT void
13818 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13819 {
13820 
13821 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13822 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13823 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13824 
13825 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13826 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13827 }
13828 
13829 MARK_AS_PMAP_TEXT bool
13830 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13831 {
13832 	bool match = false;
13833 
13834 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13835 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13836 		match = true;
13837 	}
13838 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13839 
13840 	if (match) {
13841 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13842 	}
13843 
13844 	return match;
13845 }
13846 
/*
 * Kernel-side entry point: register the compilation service CDHash,
 * trapping into the PPL when the monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13856 
/*
 * Kernel-side entry point: match a CDHash against the registered compilation
 * service CDHash, trapping into the PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13866 
13867 /*
13868  * As part of supporting local signing on the device, we need the PMAP layer
13869  * to store the local signing key so that PMAP_CS can validate with it. We
13870  * store it at the PMAP layer such that it is accessible to both AMFI and
13871  * PMAP_CS should they need it.
13872  */
/* True once the local signing public key has been registered (set-once; see setter) */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Storage for the local signing public key itself */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13875 
13876 MARK_AS_PMAP_TEXT void
13877 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13878 {
13879 	bool key_set = false;
13880 
13881 	/*
13882 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13883 	 * a successful exchange means that the local signing public key has _not_ been
13884 	 * set. In case the key has been set, we panic as we would never expect the
13885 	 * kernel to attempt to set the key more than once.
13886 	 */
13887 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13888 
13889 	if (key_set) {
13890 		panic("attempted to set the local signing public key multiple times");
13891 	}
13892 
13893 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13894 	pmap_cs_log_info("set local signing public key");
13895 }
13896 
/*
 * Kernel-side entry point: register the local signing public key, trapping
 * into the PPL when the monitor is enabled.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13906 
13907 uint8_t*
13908 pmap_get_local_signing_public_key(void)
13909 {
13910 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13911 
13912 	if (key_set) {
13913 		return pmap_local_signing_public_key;
13914 	}
13915 
13916 	return NULL;
13917 }
13918 
13919 /*
13920  * Locally signed applications need to be explicitly authorized by an entitled application
13921  * before we allow them to run.
13922  */
/* CDHash most recently unrestricted for local signing (single slot, overwritten) */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash above */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13925 
/*
 * Record a CDHash as the currently unrestricted locally-signed binary,
 * i.e. the one explicitly authorized to run. Overwrites any previously
 * unrestricted hash.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13938 
/*
 * Kernel-side entry point: unrestrict a locally-signed CDHash, trapping into
 * the PPL when the monitor is enabled.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
13949 
13950 #if PMAP_CS
/*
 * Re-restrict local signing by clearing the currently unrestricted CDHash,
 * so no locally-signed binary remains pre-authorized.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13958 
13959 MARK_AS_PMAP_TEXT static bool
13960 pmap_local_signing_restricted(
13961 	const uint8_t cdhash[CS_CDHASH_LEN])
13962 {
13963 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13964 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13965 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13966 
13967 	return ret != 0;
13968 }
13969 
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13972 
/*
 * Suspend or resume footprint accounting for the current thread. Only does
 * anything on DEVELOPMENT/DEBUG kernels; compiles to a no-op otherwise.
 *
 * On suspend, also records on the map's pmap that accounting was suspended
 * at some point; note that flag is never cleared here on resume.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
13990 
/*
 * Kernel-side entry point: suspend/resume footprint accounting for the
 * current thread, trapping into the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14002 
/*
 * No-op PPL entry point: does nothing beyond validating that the argument
 * refers to a genuine, mutable pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14008 
/*
 * Kernel-side no-op entry point; traps into the PPL when the monitor is
 * enabled, otherwise calls the internal variant directly.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14018 
14019 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14020 
/*
 * Header written into the dump buffer before each copied translation table
 * (see pmap_dump_page_tables_recurse).
 */
struct page_table_dump_header {
	uint64_t pa;            /* physical address of the copied table */
	uint64_t num_entries;   /* number of TTEs copied after this header */
	uint64_t start_va;      /* first VA covered by the table */
	uint64_t end_va;        /* first VA past the table's coverage */
};
14027 
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer
 * for debugger consumption.
 *
 * For each table whose level is selected by level_mask, a
 * struct page_table_dump_header is written followed by a verbatim copy of the
 * table's entries, and *bytes_copied is advanced past both. Invalid and
 * block-type entries are skipped; table-type entries are followed
 * recursively.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE when the remaining space cannot hold
 * the current table plus its header, KERN_SUCCESS otherwise. Panics if a
 * leaf-level entry looks like a table pointer (page-table corruption).
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and descriptor encodings from the pt_attr tables */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be smaller than a full translation table page */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/*
	 * Space for this table is required even when its level is not selected
	 * in level_mask (the check precedes the mask test below).
	 */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			continue;
		} else {
			/* Leaf-level entries should never look like table pointers */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14101 
/*
 * Dump a pmap's page tables into [bufp, buf_end), starting at the root
 * level. May only be called from kernel-debugger context; panics otherwise.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14111 
14112 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14113 
/* Page-table dumping is only available on arm64 DEVELOPMENT/DEBUG kernels */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14120 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14121 
14122 
14123 #ifdef CONFIG_XNUPOST
14124 #ifdef __arm64__
/* Set by pmap_test_fault_handler when an expected test fault is taken */
static volatile bool pmap_test_took_fault = false;
14126 
/*
 * Expected-fault handler installed around pmap test accesses. Consumes EL1
 * data aborts that are L3 permission or access-flag faults: records that the
 * fault occurred and skips the faulting instruction (PC += 4) so the test
 * can continue. Returns true when the fault was handled here.
 */
static bool
pmap_test_fault_handler(arm_saved_state_t * state)
{
	bool retval                 = false;
	uint32_t esr                = get_saved_state_esr(state);
	esr_exception_class_t class = ESR_EC(esr);
	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));

	if ((class == ESR_EC_DABORT_EL1) &&
	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
		pmap_test_took_fault = true;
		/* return to the instruction immediately after the call to NX page */
		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
		retval = true;
	}

	return retval;
}
14145 
/*
 * Perform a single read or write access at va — optionally after switching
 * to the given pmap (NULL means stay in the current address space) — and
 * check whether it faulted. Interrupts and preemption are disabled around
 * the access because the pmap switch bypasses the normal thread machinery;
 * PAN is disabled while a (non-kernel) test pmap is active.
 *
 * Returns true when the observed fault state matches should_fault.
 */
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler before touching the test VA */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14198 
14199 static bool
14200 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14201 {
14202 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14203 
14204 	if (!retval) {
14205 		T_FAIL("%s: %s, "
14206 		    "pmap=%p, va=%p, should_fault=%u",
14207 		    __func__, should_fault ? "did not fault" : "faulted",
14208 		    pmap, (void*)va, (unsigned)should_fault);
14209 	}
14210 
14211 	return retval;
14212 }
14213 
14214 static bool
14215 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14216 {
14217 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14218 
14219 	if (!retval) {
14220 		T_FAIL("%s: %s, "
14221 		    "pmap=%p, va=%p, should_fault=%u",
14222 		    __func__, should_fault ? "did not fault" : "faulted",
14223 		    pmap, (void*)va, (unsigned)should_fault);
14224 	}
14225 
14226 	return retval;
14227 }
14228 
14229 static bool
14230 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14231 {
14232 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14233 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14234 
14235 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14236 
14237 	if (!retval) {
14238 		T_FAIL("%s: bits=%u, "
14239 		    "pa=%p, should_be_set=%u",
14240 		    __func__, bits,
14241 		    (void*)pa, should_be_set);
14242 	}
14243 
14244 	return retval;
14245 }
14246 
/*
 * Verify both read and write behavior for a VA in one call. Uses bitwise OR
 * (not ||) deliberately so that BOTH accesses are always performed, even
 * when the read check already failed.
 */
static __attribute__((noinline)) bool
pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
{
	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
	return retval;
}
14253 
14254 static int
14255 pmap_test_test_config(unsigned int flags)
14256 {
14257 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
14258 	unsigned int map_count = 0;
14259 	unsigned long page_ratio = 0;
14260 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
14261 
14262 	if (!pmap) {
14263 		panic("Failed to allocate pmap");
14264 	}
14265 
14266 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14267 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14268 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14269 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14270 
14271 	if (pmap_page_size <= native_page_size) {
14272 		page_ratio = native_page_size / pmap_page_size;
14273 	} else {
14274 		/*
14275 		 * We claim to support a page_ratio of less than 1, which is
14276 		 * not currently supported by the pmap layer; panic.
14277 		 */
14278 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14279 		    "flags=%u",
14280 		    __func__, native_page_size, pmap_page_size,
14281 		    flags);
14282 	}
14283 
14284 	if (PAGE_RATIO > 1) {
14285 		/*
14286 		 * The kernel is deliberately pretending to have 16KB pages.
14287 		 * The pmap layer has code that supports this, so pretend the
14288 		 * page size is larger than it is.
14289 		 */
14290 		pmap_page_size = PAGE_SIZE;
14291 		native_page_size = PAGE_SIZE;
14292 	}
14293 
14294 	/*
14295 	 * Get two pages from the VM; one to be mapped wired, and one to be
14296 	 * mapped nonwired.
14297 	 */
14298 	vm_page_t unwired_vm_page = vm_page_grab();
14299 	vm_page_t wired_vm_page = vm_page_grab();
14300 
14301 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14302 		panic("Failed to grab VM pages");
14303 	}
14304 
14305 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14306 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14307 
14308 	pmap_paddr_t pa = ptoa(pn);
14309 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14310 
14311 	/*
14312 	 * We'll start mappings at the second twig TT.  This keeps us from only
14313 	 * using the first entry in each TT, which would trivially be address
14314 	 * 0; one of the things we will need to test is retrieving the VA for
14315 	 * a given PTE.
14316 	 */
14317 	vm_map_address_t va_base = pmap_twig_size;
14318 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14319 
14320 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14321 		/*
14322 		 * Not exactly a functional failure, but this test relies on
14323 		 * there being a spare PTE slot we can use to pin the TT.
14324 		 */
14325 		panic("Cannot pin translation table");
14326 	}
14327 
14328 	/*
14329 	 * Create the wired mapping; this will prevent the pmap layer from
14330 	 * reclaiming our test TTs, which would interfere with this test
14331 	 * ("interfere" -> "make it panic").
14332 	 */
14333 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14334 
14335 #if XNU_MONITOR
14336 	/*
14337 	 * If the PPL is enabled, make sure that the kernel cannot write
14338 	 * to PPL memory.
14339 	 */
14340 	if (!pmap_ppl_disable) {
14341 		T_LOG("Validate that kernel cannot write to PPL memory.");
14342 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14343 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14344 	}
14345 #endif
14346 
14347 	/*
14348 	 * Create read-only mappings of the nonwired page; if the pmap does
14349 	 * not use the same page size as the kernel, create multiple mappings
14350 	 * so that the kernel page is fully mapped.
14351 	 */
14352 	for (map_count = 0; map_count < page_ratio; map_count++) {
14353 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14354 	}
14355 
14356 	/* Validate that all the PTEs have the expected PA and VA. */
14357 	for (map_count = 0; map_count < page_ratio; map_count++) {
14358 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14359 
14360 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14361 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14362 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14363 		}
14364 
14365 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14366 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14367 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14368 		}
14369 	}
14370 
14371 	T_LOG("Validate that reads to our mapping do not fault.");
14372 	pmap_test_read(pmap, va_base, false);
14373 
14374 	T_LOG("Validate that writes to our mapping fault.");
14375 	pmap_test_write(pmap, va_base, true);
14376 
14377 	T_LOG("Make the first mapping writable.");
14378 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14379 
14380 	T_LOG("Validate that writes to our mapping do not fault.");
14381 	pmap_test_write(pmap, va_base, false);
14382 
14383 
14384 	T_LOG("Make the first mapping execute-only");
14385 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14386 
14387 
14388 	T_LOG("Validate that reads to our mapping do not fault.");
14389 	pmap_test_read(pmap, va_base, false);
14390 
14391 	T_LOG("Validate that writes to our mapping fault.");
14392 	pmap_test_write(pmap, va_base, true);
14393 
14394 
14395 	/*
14396 	 * For page ratios of greater than 1: validate that writes to the other
14397 	 * mappings still fault.  Remove the mappings afterwards (we're done
14398 	 * with page ratio testing).
14399 	 */
14400 	for (map_count = 1; map_count < page_ratio; map_count++) {
14401 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14402 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14403 	}
14404 
14405 	T_LOG("Mark the page unreferenced and unmodified.");
14406 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14407 	pmap_test_check_refmod(pa, 0);
14408 
14409 	/*
14410 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14411 	 * different protection/fault_type settings, and confirm that the
14412 	 * ref/mod state matches our expectations at each step.
14413 	 */
14414 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14415 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14416 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14417 
14418 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14419 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14420 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14421 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14422 
14423 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14424 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14425 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14426 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14427 
14428 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14429 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14430 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14431 
14432 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14433 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14434 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14435 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14436 
14437 	/*
14438 	 * Shared memory testing; we'll have two mappings; one read-only,
14439 	 * one read-write.
14440 	 */
14441 	vm_map_address_t rw_base = va_base;
14442 	vm_map_address_t ro_base = va_base + pmap_page_size;
14443 
14444 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14445 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14446 
14447 	/*
14448 	 * Test that we take faults as expected for unreferenced/unmodified
14449 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14450 	 * mapping permissions change as expected.
14451 	 */
14452 	T_LOG("!ref/!mod: expect no access");
14453 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14454 	pmap_test_read_write(pmap, ro_base, false, false);
14455 	pmap_test_read_write(pmap, rw_base, false, false);
14456 
14457 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14458 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14459 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14460 	pmap_test_read_write(pmap, ro_base, true, false);
14461 	pmap_test_read_write(pmap, rw_base, true, false);
14462 
14463 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14464 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14465 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14466 	pmap_test_read_write(pmap, ro_base, true, false);
14467 	pmap_test_read_write(pmap, rw_base, true, true);
14468 
14469 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14470 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14471 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14472 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14473 	pmap_test_read_write(pmap, ro_base, true, false);
14474 	pmap_test_read_write(pmap, rw_base, true, true);
14475 
14476 	T_LOG("RW protect both mappings; should not change protections.");
14477 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14478 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14479 	pmap_test_read_write(pmap, ro_base, true, false);
14480 	pmap_test_read_write(pmap, rw_base, true, true);
14481 
14482 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14483 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14484 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14485 	pmap_test_read_write(pmap, ro_base, true, false);
14486 	pmap_test_read_write(pmap, rw_base, true, false);
14487 
14488 	T_LOG("RW protect the page; mappings should not change protections.");
14489 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14490 	pmap_page_protect(pn, VM_PROT_ALL);
14491 	pmap_test_read_write(pmap, ro_base, true, false);
14492 	pmap_test_read_write(pmap, rw_base, true, true);
14493 
14494 	T_LOG("Read protect the page; RW mapping should become RO.");
14495 	pmap_page_protect(pn, VM_PROT_READ);
14496 	pmap_test_read_write(pmap, ro_base, true, false);
14497 	pmap_test_read_write(pmap, rw_base, true, false);
14498 
14499 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14500 	pmap_disconnect(pn);
14501 	if (!pmap_verify_free(pn)) {
14502 		T_FAIL("Page still has mappings");
14503 	}
14504 
14505 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14506 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14507 	pmap_destroy(pmap);
14508 
14509 	T_LOG("Release the pages back to the VM.");
14510 	vm_page_lock_queues();
14511 	vm_page_free(unwired_vm_page);
14512 	vm_page_free(wired_vm_page);
14513 	vm_page_unlock_queues();
14514 
14515 	T_LOG("Testing successful!");
14516 	return 0;
14517 }
14518 #endif /* __arm64__ */
14519 
14520 kern_return_t
14521 pmap_test(void)
14522 {
14523 	T_LOG("Starting pmap_tests");
14524 #ifdef __arm64__
14525 	int flags = 0;
14526 	flags |= PMAP_CREATE_64BIT;
14527 
14528 #if __ARM_MIXED_PAGE_SIZE__
14529 	T_LOG("Testing VM_PAGE_SIZE_4KB");
14530 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14531 	T_LOG("Testing VM_PAGE_SIZE_16KB");
14532 	pmap_test_test_config(flags);
14533 #else /* __ARM_MIXED_PAGE_SIZE__ */
14534 	pmap_test_test_config(flags);
14535 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14536 
14537 #endif /* __arm64__ */
14538 	T_PASS("completed pmap_test successfully");
14539 	return KERN_SUCCESS;
14540 }
14541 #endif /* CONFIG_XNUPOST */
14542 
14543 /*
14544  * The following function should never make it to RELEASE code, since
14545  * it provides a way to get the PPL to modify text pages.
14546  */
14547 #if DEVELOPMENT || DEBUG
14548 
14549 #define ARM_UNDEFINED_INSN 0xe7f000f0
14550 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14551 
14552 /**
14553  * Forcibly overwrite executable text with an illegal instruction.
14554  *
14555  * @note Only used for xnu unit testing.
14556  *
14557  * @param pa The physical address to corrupt.
14558  *
14559  * @return KERN_SUCCESS on success.
14560  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/*
	 * On PPL-enabled systems only the PPL may write to executable text,
	 * so route the request through the PPL entry point.
	 */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	/* No PPL: the kernel can perform the corruption directly. */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14570 
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	/* Patch through the physical aperture (kernel VA) mapping of pa. */
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/*
	 * Hold the PV head lock so the page's mapping state (including the
	 * EXEC flag consulted below) cannot change while we patch the page.
	 */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages are kept read-only in the physical aperture;
	 * temporarily grant kernel write access so the store below succeeds.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		/* Clear the THUMB marker bit before using va as a store address. */
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore the read-only aperture permission for executable text. */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * Discard any stale copy of the patched instruction from the
	 * instruction cache so the undefined instruction is actually fetched.
	 * sizeof(uint32_t) covers both the ARM (4-byte) and THUMB (2-byte) cases.
	 */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14613 
14614 #endif /* DEVELOPMENT || DEBUG */
14615