xref: /xnu-8796.121.2/osfmk/arm/pmap/pmap.c (revision c54f35ca767986246321eb901baf8f5ff7923f6a)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
132 const struct page_table_ops native_pt_ops =
133 {
134 	.alloc_id = alloc_asid,
135 	.free_id = free_asid,
136 	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
137 	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
138 	.wimg_to_pte = wimg_to_pte,
139 };
140 
141 const struct page_table_level_info pmap_table_level_info_16k[] =
142 {
143 	[0] = {
144 		.size       = ARM_16K_TT_L0_SIZE,
145 		.offmask    = ARM_16K_TT_L0_OFFMASK,
146 		.shift      = ARM_16K_TT_L0_SHIFT,
147 		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
148 		.valid_mask = ARM_TTE_VALID,
149 		.type_mask  = ARM_TTE_TYPE_MASK,
150 		.type_block = ARM_TTE_TYPE_BLOCK
151 	},
152 	[1] = {
153 		.size       = ARM_16K_TT_L1_SIZE,
154 		.offmask    = ARM_16K_TT_L1_OFFMASK,
155 		.shift      = ARM_16K_TT_L1_SHIFT,
156 		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
157 		.valid_mask = ARM_TTE_VALID,
158 		.type_mask  = ARM_TTE_TYPE_MASK,
159 		.type_block = ARM_TTE_TYPE_BLOCK
160 	},
161 	[2] = {
162 		.size       = ARM_16K_TT_L2_SIZE,
163 		.offmask    = ARM_16K_TT_L2_OFFMASK,
164 		.shift      = ARM_16K_TT_L2_SHIFT,
165 		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
166 		.valid_mask = ARM_TTE_VALID,
167 		.type_mask  = ARM_TTE_TYPE_MASK,
168 		.type_block = ARM_TTE_TYPE_BLOCK
169 	},
170 	[3] = {
171 		.size       = ARM_16K_TT_L3_SIZE,
172 		.offmask    = ARM_16K_TT_L3_OFFMASK,
173 		.shift      = ARM_16K_TT_L3_SHIFT,
174 		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
175 		.valid_mask = ARM_PTE_TYPE_VALID,
176 		.type_mask  = ARM_PTE_TYPE_MASK,
177 		.type_block = ARM_TTE_TYPE_L3BLOCK
178 	}
179 };
180 
181 const struct page_table_level_info pmap_table_level_info_4k[] =
182 {
183 	[0] = {
184 		.size       = ARM_4K_TT_L0_SIZE,
185 		.offmask    = ARM_4K_TT_L0_OFFMASK,
186 		.shift      = ARM_4K_TT_L0_SHIFT,
187 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
188 		.valid_mask = ARM_TTE_VALID,
189 		.type_mask  = ARM_TTE_TYPE_MASK,
190 		.type_block = ARM_TTE_TYPE_BLOCK
191 	},
192 	[1] = {
193 		.size       = ARM_4K_TT_L1_SIZE,
194 		.offmask    = ARM_4K_TT_L1_OFFMASK,
195 		.shift      = ARM_4K_TT_L1_SHIFT,
196 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
197 		.valid_mask = ARM_TTE_VALID,
198 		.type_mask  = ARM_TTE_TYPE_MASK,
199 		.type_block = ARM_TTE_TYPE_BLOCK
200 	},
201 	[2] = {
202 		.size       = ARM_4K_TT_L2_SIZE,
203 		.offmask    = ARM_4K_TT_L2_OFFMASK,
204 		.shift      = ARM_4K_TT_L2_SHIFT,
205 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
206 		.valid_mask = ARM_TTE_VALID,
207 		.type_mask  = ARM_TTE_TYPE_MASK,
208 		.type_block = ARM_TTE_TYPE_BLOCK
209 	},
210 	[3] = {
211 		.size       = ARM_4K_TT_L3_SIZE,
212 		.offmask    = ARM_4K_TT_L3_OFFMASK,
213 		.shift      = ARM_4K_TT_L3_SHIFT,
214 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
215 		.valid_mask = ARM_PTE_TYPE_VALID,
216 		.type_mask  = ARM_PTE_TYPE_MASK,
217 		.type_block = ARM_TTE_TYPE_L3BLOCK
218 	}
219 };
220 
221 const struct page_table_attr pmap_pt_attr_4k = {
222 	.pta_level_info = pmap_table_level_info_4k,
223 	.pta_root_level = (T0SZ_BOOT - 16) / 9,
224 #if __ARM_MIXED_PAGE_SIZE__
225 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
226 #else /* __ARM_MIXED_PAGE_SIZE__ */
227 #if __ARM_16K_PG__
228 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
229 #else /* __ARM_16K_PG__ */
230 	.pta_commpage_level = PMAP_TT_L1_LEVEL,
231 #endif /* __ARM_16K_PG__ */
232 #endif /* __ARM_MIXED_PAGE_SIZE__ */
233 	.pta_max_level  = PMAP_TT_L3_LEVEL,
234 	.pta_ops = &native_pt_ops,
235 	.ap_ro = ARM_PTE_AP(AP_RORO),
236 	.ap_rw = ARM_PTE_AP(AP_RWRW),
237 	.ap_rona = ARM_PTE_AP(AP_RONA),
238 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
239 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
240 	.ap_x = ARM_PTE_PNX,
241 #if __ARM_MIXED_PAGE_SIZE__
242 	.pta_tcr_value  = TCR_EL1_4KB,
243 #endif /* __ARM_MIXED_PAGE_SIZE__ */
244 	.pta_page_size  = 4096,
245 	.pta_page_shift = 12,
246 };
247 
248 const struct page_table_attr pmap_pt_attr_16k = {
249 	.pta_level_info = pmap_table_level_info_16k,
250 	.pta_root_level = PMAP_TT_L1_LEVEL,
251 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
252 	.pta_max_level  = PMAP_TT_L3_LEVEL,
253 	.pta_ops = &native_pt_ops,
254 	.ap_ro = ARM_PTE_AP(AP_RORO),
255 	.ap_rw = ARM_PTE_AP(AP_RWRW),
256 	.ap_rona = ARM_PTE_AP(AP_RONA),
257 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
258 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
259 	.ap_x = ARM_PTE_PNX,
260 #if __ARM_MIXED_PAGE_SIZE__
261 	.pta_tcr_value  = TCR_EL1_16KB,
262 #endif /* __ARM_MIXED_PAGE_SIZE__ */
263 	.pta_page_size  = 16384,
264 	.pta_page_shift = 14,
265 };
266 
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272 
273 
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276 
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279 
280 #endif /* MACH_ASSERT */
281 
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 	(current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288 
289 
290 /*
291  * Represents a tlb range that will be flushed before exiting
292  * the ppl.
293  * Used by phys_attribute_clear_range to defer flushing pages in
294  * this range until the end of the operation.
295  */
296 typedef struct pmap_tlb_flush_range {
297 	pmap_t ptfr_pmap;
298 	vm_map_address_t ptfr_start;
299 	vm_map_address_t ptfr_end;
300 	bool ptfr_flush_needed;
301 } pmap_tlb_flush_range_t;
302 
303 #if XNU_MONITOR
304 /*
305  * PPL External References.
306  */
307 extern vm_offset_t   segPPLDATAB;
308 extern unsigned long segSizePPLDATA;
309 extern vm_offset_t   segPPLTEXTB;
310 extern unsigned long segSizePPLTEXT;
311 extern vm_offset_t   segPPLDATACONSTB;
312 extern unsigned long segSizePPLDATACONST;
313 
314 
315 /*
316  * PPL Global Variables
317  */
318 
319 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
320 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
321 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
322 #else
323 const boolean_t pmap_ppl_disable = FALSE;
324 #endif
325 
326 /*
327  * Indicates if the PPL has started applying APRR.
328  * This variable is accessed from various assembly trampolines, so be sure to change
329  * those if you change the size or layout of this variable.
330  */
331 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
332 
333 extern void *pmap_stacks_start;
334 extern void *pmap_stacks_end;
335 
#endif /* XNU_MONITOR */
337 
338 
339 
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
342 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344 
345 extern uint8_t bootstrap_pagetables[];
346 
347 extern unsigned int not_in_kdp;
348 
349 extern vm_offset_t first_avail;
350 
351 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
352 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
353 extern vm_offset_t     static_memory_end;
354 
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357 
358 extern int maxproc, hard_maxproc;
359 
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362 
363 /*
364  * The bounds on our TTBRs.  These are for sanity checking that
365  * an address is accessible by a TTBR before we attempt to map it.
366  */
367 
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370 
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373 
374 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t    kernel_pmap = &kernel_pmap_store;
376 
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
378 
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
382 
383 typedef struct tt_free_entry {
384 	struct tt_free_entry    *next;
385 } tt_free_entry_t;
386 
387 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
388 
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX   4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
400 
401 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
402 
403 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409 
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412 
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
415 
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418 
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1;                                     /* enable no-execute protection */
421 int allow_data_exec  = 0;                               /* No apps may execute data */
422 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1;                                       /* enable no-execute protection */
428 const int allow_data_exec  = 0;                         /* No apps may execute data */
429 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431 
432 /**
433  * This variable is set true during hibernation entry to protect pmap data structures
434  * during image copying, and reset false on hibernation exit.
435  */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437 
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/*
 * On non-MACH_ASSERT builds ledger verification is compiled out entirely;
 * this empty inline keeps call sites unconditional.
 */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
446 
447 /**
448  * This helper function ensures that potentially-long-running batched PPL operations are
449  * called in preemptible context before entering the PPL, so that the PPL call may
450  * periodically exit to allow pending urgent ASTs to be taken.
451  */
static inline void
pmap_verify_preemptible(void)
{
	/*
	 * Preemption must be enabled before entering a long-running batched PPL
	 * call so the PPL can periodically exit for urgent ASTs; the only
	 * exception is early boot, before the scheduler is up.
	 */
	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
}
457 
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459 
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
462 
463 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
464 
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470     "Minimum address space size outside allowable range");
471 
472 // Max offset is 13.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 9.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476 
477 
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479     "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481     "Small device address space size outside allowable range");
482 
483 #  ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 #  else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 #  endif
488 #endif /* __arm64__ */
489 
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495 
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
499 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
500 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
501 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
502 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
503 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
504 
505 
506 #if __ARM_MIXED_PAGE_SIZE__
507 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
508 #endif
509 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
510 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
511 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
512 
513 /* PTE Define Macros */
514 
515 #define ARM_PTE_IS_COMPRESSED(x, p) \
516 	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
517 	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
518 	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
519 	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
520 	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
521 
522 #define pte_is_wired(pte)                                                               \
523 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
524 
525 #define pte_was_writeable(pte) \
526 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
527 
528 #define pte_set_was_writeable(pte, was_writeable) \
529 	do {                                         \
530 	        if ((was_writeable)) {               \
531 	                (pte) |= ARM_PTE_WRITEABLE;  \
532 	        } else {                             \
533 	                (pte) &= ~ARM_PTE_WRITEABLE; \
534 	        }                                    \
535 	} while(0)
536 
537 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)538 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
539 {
540 	if (wired) {
541 		*ptep |= ARM_PTE_WIRED;
542 	} else {
543 		*ptep &= ~ARM_PTE_WIRED;
544 	}
545 	/*
546 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
547 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
548 	 * never reclaimed.
549 	 */
550 	if (pmap == kernel_pmap) {
551 		return;
552 	}
553 	unsigned short *ptd_wiredcnt_ptr;
554 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
555 	if (wired) {
556 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
557 	} else {
558 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
559 		if (__improbable(prev_wired == 0)) {
560 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
561 		}
562 	}
563 }
564 
565 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                       \
566 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
567 	arm64_sync_tlb(strong);                                                                        \
568 }
569 
570 /*
571  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
572  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
573  * will observe the updated PTE.
574  */
575 #define FLUSH_PTE()                                                                     \
576 	__builtin_arm_dmb(DMB_ISH);
577 
578 /*
579  * Synchronize updates to PTEs that were previously valid and thus may be cached in
580  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
581  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
582  * program order will not issue until the DSB completes.  Prior loads may be reordered
583  * after the barrier, but their behavior should not be materially affected by the
584  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
585  * matter for loads until the access is re-driven well after the TLB update is
586  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
587  * we should be in a position to handle access faults.  For "voluntary" PTE access
588  * restriction due to unmapping or protection, the decision to restrict access should
589  * have a data dependency on prior loads in order to avoid a data race.
590  */
591 #define FLUSH_PTE_STRONG()                                                             \
592 	__builtin_arm_dsb(DSB_ISHST);
593 
594 /**
595  * Write enough page table entries to map a single VM page. On systems where the
596  * VM page size does not match the hardware page size, multiple page table
597  * entries will need to be written.
598  *
599  * @note This function does not emit a barrier to ensure these page table writes
600  *       have completed before continuing. This is commonly needed. In the case
601  *       where a DMB or DSB barrier is needed, then use the write_pte() and
602  *       write_pte_strong() functions respectively instead of this one.
603  *
604  * @param ptep Pointer to the first page table entry to update.
605  * @param pte The value to write into each page table entry. In the case that
606  *            multiple PTEs are updated to a non-empty value, then the address
607  *            in this value will automatically be incremented for each PTE
608  *            write.
609  */
610 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)611 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
612 {
613 	/**
614 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
615 	 * systems, which is why it's checked at runtime instead of compile time.
616 	 * The "unreachable" warning needs to be suppressed because it still is a
617 	 * compile time constant on some systems.
618 	 */
619 	__unreachable_ok_push
620 	if (TEST_PAGE_RATIO_4) {
621 		if (((uintptr_t)ptep) & 0x1f) {
622 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
623 			    __func__, ptep, (void*)pte);
624 		}
625 
626 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
627 			/**
628 			 * If we're writing an empty/compressed PTE value, then don't
629 			 * auto-increment the address for each PTE write.
630 			 */
631 			*ptep = pte;
632 			*(ptep + 1) = pte;
633 			*(ptep + 2) = pte;
634 			*(ptep + 3) = pte;
635 		} else {
636 			*ptep = pte;
637 			*(ptep + 1) = pte | 0x1000;
638 			*(ptep + 2) = pte | 0x2000;
639 			*(ptep + 3) = pte | 0x3000;
640 		}
641 	} else {
642 		*ptep = pte;
643 	}
644 	__unreachable_ok_pop
645 }
646 
647 /**
648  * Writes enough page table entries to map a single VM page and then ensures
649  * those writes complete by executing a Data Memory Barrier.
650  *
651  * @note The DMB issued by this function is not strong enough to protect against
652  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
653  *       instruction is going to immediately be called after this write, it's
654  *       recommended to call write_pte_strong() instead of this function.
655  *
656  * See the function header for write_pte_fast() for more details on the
657  * parameters.
658  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DMB: make the PTE store(s) visible to subsequent loads. */
	FLUSH_PTE();
}
665 
666 /**
667  * Writes enough page table entries to map a single VM page and then ensures
668  * those writes complete by executing a Data Synchronization Barrier. This
669  * barrier provides stronger guarantees than the DMB executed by write_pte().
670  *
671  * @note This function is useful if you're going to immediately flush the TLB
672  *       after making the PTE write. A DSB is required to protect against the
673  *       TLB invalidate being reordered before the PTE write.
674  *
675  * See the function header for write_pte_fast() for more details on the
676  * parameters.
677  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DSB: order the PTE store(s) ahead of any TLBI that follows. */
	FLUSH_PTE_STRONG();
}
684 
685 /**
686  * Retrieve the pmap structure for the thread running on the current CPU.
687  */
688 pmap_t
current_pmap()689 current_pmap()
690 {
691 	const pmap_t current = vm_map_pmap(current_thread()->map);
692 
693 	assert(current != NULL);
694 
695 #if XNU_MONITOR
696 	/**
697 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
698 	 * decided by kernel-writable memory. This function is used in various parts
699 	 * of the PPL, and besides validating that the pointer returned by this
700 	 * function is indeed a pmap structure, it's also important to ensure that
701 	 * it's actually the current thread's pmap. This is because different pmaps
702 	 * will have access to different entitlements based on the code signature of
703 	 * their loaded process. So if a different user pmap is set in the current
704 	 * thread structure (in an effort to bypass code signing restrictions), even
705 	 * though the structure would validate correctly as it is a real pmap
706 	 * structure, it should fail here.
707 	 *
708 	 * This only needs to occur for user pmaps because the kernel pmap's root
709 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
710 	 * changed so it'd be redundant to check), and its code signing fields are
711 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
712 	 * it shouldn't be possible to set those fields. Due to that, an attacker
713 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
714 	 * this check won't accomplish anything as it doesn't provide any extra code
715 	 * signing entitlements.
716 	 */
717 	if ((current != kernel_pmap) &&
718 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
719 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
720 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
721 	}
722 #endif /* XNU_MONITOR */
723 
724 	return current;
725 }
726 
727 #if DEVELOPMENT || DEBUG
728 
729 /*
730  * Trace levels are controlled by a bitmask in which each
731  * level can be enabled/disabled by the (1<<level) position
732  * in the boot arg
733  * Level 0: PPL extension functionality
734  * Level 1: pmap lifecycle (create/destroy/switch)
735  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
736  * Level 3: internal state management (attributes/fast-fault)
737  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
738  */
739 
740 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
741 
742 #define PMAP_TRACE(level, ...) \
743 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
744 	        KDBG_RELEASE(__VA_ARGS__); \
745 	}
746 #else /* DEVELOPMENT || DEBUG */
747 
748 #define PMAP_TRACE(level, ...)
749 
750 #endif /* DEVELOPMENT || DEBUG */
751 
752 
753 /*
754  * Internal function prototypes (forward declarations).
755  */
756 
757 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
758 
759 static void pmap_set_reference(ppnum_t pn);
760 
761 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
762 
763 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
764 
765 static kern_return_t pmap_expand(
766 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
767 
768 static int pmap_remove_range(
769 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
770 
771 static tt_entry_t *pmap_tt1_allocate(
772 	pmap_t, vm_size_t, unsigned int);
773 
774 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
775 
776 static void pmap_tt1_deallocate(
777 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
778 
779 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
780 
781 static kern_return_t pmap_tt_allocate(
782 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
783 
784 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
785 
786 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
787 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
788 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
789 
790 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
791 
792 
793 static void pmap_unmap_commpage(
794 	pmap_t pmap);
795 
796 static boolean_t
797 pmap_is_64bit(pmap_t);
798 
799 
800 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
801 
802 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
803 
804 static bool pmap_update_cache_attributes_locked(
805 	ppnum_t, unsigned, bool);
806 
807 static boolean_t arm_clear_fast_fault(
808 	ppnum_t ppnum,
809 	vm_prot_t fault_type,
810 	pt_entry_t *pte_p);
811 
812 static void pmap_trim_self(pmap_t pmap);
813 static void pmap_trim_subord(pmap_t subord);
814 
815 
816 /*
817  * Temporary prototypes, while we wait for pmap_enter to move to taking an
818  * address instead of a page number.
819  */
820 static kern_return_t
821 pmap_enter_addr(
822 	pmap_t pmap,
823 	vm_map_address_t v,
824 	pmap_paddr_t pa,
825 	vm_prot_t prot,
826 	vm_prot_t fault_type,
827 	unsigned int flags,
828 	boolean_t wired);
829 
830 kern_return_t
831 pmap_enter_options_addr(
832 	pmap_t pmap,
833 	vm_map_address_t v,
834 	pmap_paddr_t pa,
835 	vm_prot_t prot,
836 	vm_prot_t fault_type,
837 	unsigned int flags,
838 	boolean_t wired,
839 	unsigned int options,
840 	__unused void   *arg);
841 
842 #ifdef CONFIG_XNUPOST
843 kern_return_t pmap_test(void);
844 #endif /* CONFIG_XNUPOST */
845 
846 PMAP_SUPPORT_PROTOTYPES(
847 	kern_return_t,
848 	arm_fast_fault, (pmap_t pmap,
849 	vm_map_address_t va,
850 	vm_prot_t fault_type,
851 	bool was_af_fault,
852 	bool from_user), ARM_FAST_FAULT_INDEX);
853 
854 PMAP_SUPPORT_PROTOTYPES(
855 	boolean_t,
856 	arm_force_fast_fault, (ppnum_t ppnum,
857 	vm_prot_t allow_mode,
858 	int options), ARM_FORCE_FAST_FAULT_INDEX);
859 
860 MARK_AS_PMAP_TEXT static boolean_t
861 arm_force_fast_fault_with_flush_range(
862 	ppnum_t ppnum,
863 	vm_prot_t allow_mode,
864 	int options,
865 	pmap_tlb_flush_range_t *flush_range);
866 
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 *
 * The whole state is packed into a single 64-bit value so that it can be
 * passed to/returned from pmap_batch_set_cache_attributes() as one scalar
 * and the operation can be resumed where it left off.
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	:0;                                 /* Zero-width member: terminate the bit-field sequence */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4

/* The packed state must fit exactly in a 64-bit scalar; see struct above. */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
886 
887 PMAP_SUPPORT_PROTOTYPES(
888 	batch_set_cache_attr_state_t,
889 	pmap_batch_set_cache_attributes, (
890 #if XNU_MONITOR
891 		volatile upl_page_info_t *user_page_list,
892 #else /* !XNU_MONITOR */
893 		upl_page_info_array_t user_page_list,
894 #endif /* XNU_MONITOR */
895 		batch_set_cache_attr_state_t state,
896 		unsigned int page_cnt,
897 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
898 
899 PMAP_SUPPORT_PROTOTYPES(
900 	kern_return_t,
901 	pmap_change_wiring, (pmap_t pmap,
902 	vm_map_address_t v,
903 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
904 
905 PMAP_SUPPORT_PROTOTYPES(
906 	pmap_t,
907 	pmap_create_options, (ledger_t ledger,
908 	vm_map_size_t size,
909 	unsigned int flags,
910 	kern_return_t * kr), PMAP_CREATE_INDEX);
911 
912 PMAP_SUPPORT_PROTOTYPES(
913 	void,
914 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
915 
916 PMAP_SUPPORT_PROTOTYPES(
917 	kern_return_t,
918 	pmap_enter_options, (pmap_t pmap,
919 	vm_map_address_t v,
920 	pmap_paddr_t pa,
921 	vm_prot_t prot,
922 	vm_prot_t fault_type,
923 	unsigned int flags,
924 	boolean_t wired,
925 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
926 
927 PMAP_SUPPORT_PROTOTYPES(
928 	pmap_paddr_t,
929 	pmap_find_pa, (pmap_t pmap,
930 	addr64_t va), PMAP_FIND_PA_INDEX);
931 
932 PMAP_SUPPORT_PROTOTYPES(
933 	kern_return_t,
934 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
935 
936 
937 PMAP_SUPPORT_PROTOTYPES(
938 	boolean_t,
939 	pmap_is_empty, (pmap_t pmap,
940 	vm_map_offset_t va_start,
941 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
942 
943 
944 PMAP_SUPPORT_PROTOTYPES(
945 	unsigned int,
946 	pmap_map_cpu_windows_copy, (ppnum_t pn,
947 	vm_prot_t prot,
948 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
949 
950 PMAP_SUPPORT_PROTOTYPES(
951 	void,
952 	pmap_ro_zone_memcpy, (zone_id_t zid,
953 	vm_offset_t va,
954 	vm_offset_t offset,
955 	const vm_offset_t new_data,
956 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
957 
958 PMAP_SUPPORT_PROTOTYPES(
959 	uint64_t,
960 	pmap_ro_zone_atomic_op, (zone_id_t zid,
961 	vm_offset_t va,
962 	vm_offset_t offset,
963 	zro_atomic_op_t op,
964 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
965 
966 PMAP_SUPPORT_PROTOTYPES(
967 	void,
968 	pmap_ro_zone_bzero, (zone_id_t zid,
969 	vm_offset_t va,
970 	vm_offset_t offset,
971 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
972 
973 PMAP_SUPPORT_PROTOTYPES(
974 	vm_map_offset_t,
975 	pmap_nest, (pmap_t grand,
976 	pmap_t subord,
977 	addr64_t vstart,
978 	uint64_t size,
979 	vm_map_offset_t vrestart,
980 	kern_return_t * krp), PMAP_NEST_INDEX);
981 
982 PMAP_SUPPORT_PROTOTYPES(
983 	void,
984 	pmap_page_protect_options, (ppnum_t ppnum,
985 	vm_prot_t prot,
986 	unsigned int options,
987 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
988 
989 PMAP_SUPPORT_PROTOTYPES(
990 	vm_map_address_t,
991 	pmap_protect_options, (pmap_t pmap,
992 	vm_map_address_t start,
993 	vm_map_address_t end,
994 	vm_prot_t prot,
995 	unsigned int options,
996 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
997 
998 PMAP_SUPPORT_PROTOTYPES(
999 	kern_return_t,
1000 	pmap_query_page_info, (pmap_t pmap,
1001 	vm_map_offset_t va,
1002 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1003 
1004 PMAP_SUPPORT_PROTOTYPES(
1005 	mach_vm_size_t,
1006 	pmap_query_resident, (pmap_t pmap,
1007 	vm_map_address_t start,
1008 	vm_map_address_t end,
1009 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1010 
1011 PMAP_SUPPORT_PROTOTYPES(
1012 	void,
1013 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1014 
1015 PMAP_SUPPORT_PROTOTYPES(
1016 	vm_map_address_t,
1017 	pmap_remove_options, (pmap_t pmap,
1018 	vm_map_address_t start,
1019 	vm_map_address_t end,
1020 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1021 
1022 
1023 PMAP_SUPPORT_PROTOTYPES(
1024 	void,
1025 	pmap_set_cache_attributes, (ppnum_t pn,
1026 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1027 
1028 PMAP_SUPPORT_PROTOTYPES(
1029 	void,
1030 	pmap_update_compressor_page, (ppnum_t pn,
1031 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1032 
1033 PMAP_SUPPORT_PROTOTYPES(
1034 	void,
1035 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1036 
1037 #if MACH_ASSERT || XNU_MONITOR
1038 PMAP_SUPPORT_PROTOTYPES(
1039 	void,
1040 	pmap_set_process, (pmap_t pmap,
1041 	int pid,
1042 	char *procname), PMAP_SET_PROCESS_INDEX);
1043 #endif
1044 
1045 PMAP_SUPPORT_PROTOTYPES(
1046 	void,
1047 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1048 
1049 PMAP_SUPPORT_PROTOTYPES(
1050 	vm_map_offset_t,
1051 	pmap_unnest_options, (pmap_t grand,
1052 	addr64_t vaddr,
1053 	uint64_t size,
1054 	vm_map_offset_t vrestart,
1055 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1056 
1057 PMAP_SUPPORT_PROTOTYPES(
1058 	void,
1059 	phys_attribute_set, (ppnum_t pn,
1060 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1061 
1062 PMAP_SUPPORT_PROTOTYPES(
1063 	void,
1064 	phys_attribute_clear, (ppnum_t pn,
1065 	unsigned int bits,
1066 	int options,
1067 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1068 
1069 #if __ARM_RANGE_TLBI__
1070 PMAP_SUPPORT_PROTOTYPES(
1071 	vm_map_address_t,
1072 	phys_attribute_clear_range, (pmap_t pmap,
1073 	vm_map_address_t start,
1074 	vm_map_address_t end,
1075 	unsigned int bits,
1076 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1077 #endif /* __ARM_RANGE_TLBI__ */
1078 
1079 
1080 PMAP_SUPPORT_PROTOTYPES(
1081 	void,
1082 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1083 
1084 PMAP_SUPPORT_PROTOTYPES(
1085 	void,
1086 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1087 
1088 PMAP_SUPPORT_PROTOTYPES(
1089 	void,
1090 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1091 
1092 PMAP_SUPPORT_PROTOTYPES(
1093 	void,
1094 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1095 
1096 PMAP_SUPPORT_PROTOTYPES(
1097 	void,
1098 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1099 
1100 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1101 PMAP_SUPPORT_PROTOTYPES(
1102 	void,
1103 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1104 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1105 
/*
 * Definition of the states used by pmap_trim().
 *
 * pmap_trim() is structured as a state machine: it returns the next state so
 * that the operation can be resumed across repeated calls (NOTE(review):
 * inferred from the returned pmap_trim_state_t — confirm against the
 * implementation of pmap_trim_internal()).
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;

PMAP_SUPPORT_PROTOTYPES(
	pmap_trim_state_t,
	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1130 
1131 #if HAS_APPLE_PAC
1132 PMAP_SUPPORT_PROTOTYPES(
1133 	void *,
1134 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1135 PMAP_SUPPORT_PROTOTYPES(
1136 	void *,
1137 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1138 #endif /* HAS_APPLE_PAC */
1139 
1140 
1141 
1142 
1143 PMAP_SUPPORT_PROTOTYPES(
1144 	kern_return_t,
1145 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1146 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1147 
1148 PMAP_SUPPORT_PROTOTYPES(
1149 	kern_return_t,
1150 	pmap_load_trust_cache_with_type, (TCType_t type,
1151 	const vm_address_t pmap_img4_payload,
1152 	const vm_size_t pmap_img4_payload_len,
1153 	const vm_address_t img4_manifest,
1154 	const vm_size_t img4_manifest_len,
1155 	const vm_address_t img4_aux_manifest,
1156 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1157 
1158 PMAP_SUPPORT_PROTOTYPES(
1159 	void,
1160 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1161 
1162 PMAP_SUPPORT_PROTOTYPES(
1163 	kern_return_t,
1164 	pmap_query_trust_cache, (TCQueryType_t query_type,
1165 	const uint8_t cdhash[kTCEntryHashSize],
1166 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1167 
1168 #if PMAP_CS_INCLUDE_CODE_SIGNING
1169 
1170 PMAP_SUPPORT_PROTOTYPES(
1171 	kern_return_t,
1172 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1173 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1174 
1175 PMAP_SUPPORT_PROTOTYPES(
1176 	kern_return_t,
1177 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1178 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1179 
1180 PMAP_SUPPORT_PROTOTYPES(
1181 	kern_return_t,
1182 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1183 	pmap_cs_profile_t * profile_obj),
1184 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1185 
1186 PMAP_SUPPORT_PROTOTYPES(
1187 	kern_return_t,
1188 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1189 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1190 
1191 PMAP_SUPPORT_PROTOTYPES(
1192 	kern_return_t,
1193 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1194 	const void *kernel_entitlements),
1195 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1196 
1197 PMAP_SUPPORT_PROTOTYPES(
1198 	kern_return_t,
1199 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1200 	const void **kernel_entitlements),
1201 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1202 
1203 PMAP_SUPPORT_PROTOTYPES(
1204 	kern_return_t,
1205 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1206 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1207 
1208 PMAP_SUPPORT_PROTOTYPES(
1209 	kern_return_t,
1210 	pmap_cs_allow_invalid, (pmap_t pmap),
1211 	PMAP_CS_ALLOW_INVALID_INDEX);
1212 
1213 PMAP_SUPPORT_PROTOTYPES(
1214 	void,
1215 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1216 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1217 
1218 PMAP_SUPPORT_PROTOTYPES(
1219 	bool,
1220 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1221 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1222 
1223 PMAP_SUPPORT_PROTOTYPES(
1224 	void,
1225 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1226 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1227 
1228 PMAP_SUPPORT_PROTOTYPES(
1229 	void,
1230 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1231 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1232 
1233 #endif
1234 
1235 PMAP_SUPPORT_PROTOTYPES(
1236 	uint32_t,
1237 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1238 
1239 PMAP_SUPPORT_PROTOTYPES(
1240 	bool,
1241 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1242 
1243 PMAP_SUPPORT_PROTOTYPES(
1244 	void,
1245 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1246 
1247 void pmap_footprint_suspend(vm_map_t    map,
1248     boolean_t   suspend);
1249 PMAP_SUPPORT_PROTOTYPES(
1250 	void,
1251 	pmap_footprint_suspend, (vm_map_t map,
1252 	boolean_t suspend),
1253 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1254 
1255 
1256 
1257 
1258 #if DEVELOPMENT || DEBUG
1259 PMAP_SUPPORT_PROTOTYPES(
1260 	kern_return_t,
1261 	pmap_test_text_corruption, (pmap_paddr_t),
1262 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1263 #endif /* DEVELOPMENT || DEBUG */
1264 
1265 /*
1266  * The low global vector page is mapped at a fixed alias.
1267  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1268  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1269  * to check both addresses anyway for backward compatibility. So for now
1270  * we leave H6 and H7 where they were.
1271  */
1272 #if (ARM_PGSHIFT == 14)
1273 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1274 #else
1275 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1276 #endif
1277 
1278 
1279 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1280 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1281 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1282 
#if XNU_MONITOR

/*
 * When pointer authentication is available, each entry in the PPL dispatch
 * table is signed as a function pointer so a corrupted/forged entry cannot
 * be dispatched through.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX constants; designated initializers mean any
 * index without an explicit entry is implicitly NULL. Entries guarded by
 * #if blocks exist only in the corresponding configurations.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1375 
1376 #if XNU_MONITOR
1377 /**
1378  * A convenience function for setting protections on a single physical
1379  * aperture or static region mapping without invalidating the TLB.
1380  *
1381  * @note This function does not perform any TLB invalidations. That must be done
1382  *       separately to be able to safely use the updated mapping.
1383  *
1384  * @note This function understands the difference between the VM page size and
1385  *       the kernel page size and will update multiple PTEs if the sizes differ.
1386  *       In other words, enough PTEs will always get updated to change the
1387  *       permissions on a PAGE_SIZE amount of memory.
1388  *
1389  * @note The PVH lock for the physical page represented by this mapping must
1390  *       already be locked.
1391  *
1392  * @note This function assumes the caller has already verified that the PTE
1393  *       pointer does indeed point to a physical aperture or static region page
1394  *       table. Please validate your inputs before passing it along to this
1395  *       function.
1396  *
1397  * @param ptep Pointer to the physical aperture or static region page table to
1398  *             update with a new XPRR index.
1399  * @param expected_perm The XPRR index that is expected to already exist at the
1400  *                      current mapping. If the current index doesn't match this
1401  *                      then the system will panic.
1402  * @param new_perm The new XPRR index to update the mapping with.
1403  */
1404 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1405 pmap_set_pte_xprr_perm(
1406 	pt_entry_t * const ptep,
1407 	unsigned int expected_perm,
1408 	unsigned int new_perm)
1409 {
1410 	assert(ptep != NULL);
1411 
1412 	pt_entry_t spte = *ptep;
1413 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1414 
1415 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1416 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1417 		    __func__, ptep, new_perm, expected_perm);
1418 	}
1419 
1420 	/**
1421 	 * The PTE involved should be valid, should not have the hint bit set, and
1422 	 * should have the expected XPRR index.
1423 	 */
1424 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1425 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1426 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1427 		    __func__, ptep, spte, new_perm, expected_perm);
1428 	}
1429 
1430 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1431 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1432 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1433 		    __func__, ptep, spte, new_perm, expected_perm);
1434 	}
1435 
1436 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1437 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1438 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1439 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1440 	}
1441 
1442 	pt_entry_t template = spte;
1443 	template &= ~ARM_PTE_XPRR_MASK;
1444 	template |= xprr_perm_to_pte(new_perm);
1445 
1446 	write_pte_strong(ptep, template);
1447 }
1448 
1449 /**
1450  * Update the protections on a single physical aperture mapping and invalidate
1451  * the TLB so the mapping can be used.
1452  *
1453  * @note The PVH lock for the physical page must already be locked.
1454  *
1455  * @param pai The physical address index of the page whose physical aperture
1456  *            mapping will be updated with new permissions.
1457  * @param expected_perm The XPRR index that is expected to already exist at the
1458  *                      current mapping. If the current index doesn't match this
1459  *                      then the system will panic.
1460  * @param new_perm The new XPRR index to update the mapping with.
1461  */
1462 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1463 pmap_set_xprr_perm(
1464 	unsigned int pai,
1465 	unsigned int expected_perm,
1466 	unsigned int new_perm)
1467 {
1468 	pvh_assert_locked(pai);
1469 
1470 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1471 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1472 
1473 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1474 
1475 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1476 	sync_tlb_flush();
1477 }
1478 
1479 /**
1480  * Update the protections on a range of physical aperture or static region
1481  * mappings and invalidate the TLB so the mappings can be used.
1482  *
1483  * @note Static region mappings can only be updated before machine_lockdown().
1484  *       Physical aperture mappings can be updated at any time.
1485  *
1486  * @param start The starting virtual address of the static region or physical
1487  *              aperture range whose permissions will be updated.
1488  * @param end The final (inclusive) virtual address of the static region or
1489  *            physical aperture range whose permissions will be updated.
1490  * @param expected_perm The XPRR index that is expected to already exist at the
1491  *                      current mappings. If the current indices don't match
1492  *                      this then the system will panic.
1493  * @param new_perm The new XPRR index to update the mappings with.
1494  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely in the physical aperture or entirely in the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * (Adding one twig size and masking rounds "va" up to the next twig
		 * boundary; this works because "va" is always twig-aligned here except
		 * possibly on the first iteration.)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (possibly partial) twig to the requested range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type entries can be walked; block mappings are unexpected here. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the PVH lock of the mapped page. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One batched TLB invalidation for the whole range, after all PTE updates. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1590 
1591 #endif /* XNU_MONITOR */
1592 
1593 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1594 PMAP_ZINFO_PALLOC(
1595 	pmap_t pmap, int bytes)
1596 {
1597 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1598 }
1599 
1600 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1601 PMAP_ZINFO_PFREE(
1602 	pmap_t pmap,
1603 	int bytes)
1604 {
1605 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1606 }
1607 
1608 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1609 pmap_tt_ledger_credit(
1610 	pmap_t          pmap,
1611 	vm_size_t       size)
1612 {
1613 	if (pmap != kernel_pmap) {
1614 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1615 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1616 	}
1617 }
1618 
1619 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1620 pmap_tt_ledger_debit(
1621 	pmap_t          pmap,
1622 	vm_size_t       size)
1623 {
1624 	if (pmap != kernel_pmap) {
1625 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1626 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1627 	}
1628 }
1629 
/*
 * Record use of an ASID in the pseudo-LRU allocator state.
 * Each 64-bit word of asid_plru_bitmap covers 64 ASIDs; a set bit means
 * "available/not recently used".
 */
static inline void
pmap_update_plru(uint16_t asid_index)
{
	if (__probable(pmap_asid_plru)) {
		/* Word index within the bitmap: asid_index / 64. */
		unsigned plru_index = asid_index >> 6;
		/*
		 * Atomically clear this ASID's bit. When the whole word drains to
		 * zero, stamp it with a fresh generation and refill it so those 64
		 * ASIDs become eligible again.
		 */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/*
			 * The word containing ASID MAX_HW_ASIDS is refilled with its top
			 * bit clear — presumably that slot is reserved (alloc_asid()
			 * special-cases hw_asid == MAX_HW_ASIDS); confirm.
			 */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
}
1641 
/*
 * Allocate a virtual ASID (VASID) for `pmap` and derive from it the
 * hardware ASID (stored in pmap->hw_asid) and software epoch/chunk index
 * (stored in pmap->sw_asid).
 *
 * The VASID space [0, pmap_max_asids) is divided into chunks of
 * asid_chunk_size; `vasid % asid_chunk_size` selects the hardware ASID and
 * `vasid / asid_chunk_size` the software ASID.  When the pseudo-LRU
 * allocator is enabled, the search first targets free VASIDs whose
 * hardware ASIDs fall in the least-recently-used PLRU word, to reduce TLB
 * pressure from ASID reuse.
 *
 * @return true on success; false if no VASID is available.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

	if (__probable(pmap_asid_plru)) {
		/* Pick the PLRU word with the oldest (smallest) generation count. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Stride through the free-VASID bitmap one chunk at a time
		 * ((MAX_HW_ASIDS + 1) / 64 words per chunk), looking for a free
		 * VASID whose hardware ASID lies in the least-recently-used word.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every VASID is in use. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Claim the VASID (clear its "free" bit) before dropping the lock. */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
	pmap_simple_unlock(&asid_lock);
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1709 
/*
 * Release the ASID held by `pmap`, making its VASID available again and
 * (when PLRU is enabled) marking the hardware ASID as a reuse candidate.
 *
 * This inverts the encoding performed by alloc_asid(): the hw_asid field
 * is un-shifted/un-offset, then the VASID is reconstructed either from the
 * reserved-VASID mapping (sw_asid == UINT8_MAX case) or from the normal
 * chunk arithmetic.  A pmap with hw_asid == 0 never had an ASID assigned,
 * so the function is a no-op for it.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically take ownership of the ASID so a double-free is a no-op. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;  /* Undo the EL0/EL1 pairing shift applied by alloc_asid(). */
#endif
	hw_asid -= 1;   /* Undo the +1 offset that reserves ASID 0 for the kernel. */

	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved VASID: invert alloc_asid()'s high-end reassignment. */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Make the hardware ASID available to the pseudo-LRU allocator again. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1738 
1739 
1740 boolean_t
pmap_valid_address(pmap_paddr_t addr)1741 pmap_valid_address(
1742 	pmap_paddr_t addr)
1743 {
1744 	return pa_valid(addr);
1745 }
1746 
1747 
1748 
1749 
1750 
1751 
1752 /*
1753  *      Map memory at initialization.  The physical addresses being
1754  *      mapped are not managed and are never unmapped.
1755  *
1756  *      For now, VM is already on, we only need to map the
1757  *      specified memory.
1758  */
1759 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1760 pmap_map(
1761 	vm_map_address_t virt,
1762 	vm_offset_t start,
1763 	vm_offset_t end,
1764 	vm_prot_t prot,
1765 	unsigned int flags)
1766 {
1767 	kern_return_t   kr;
1768 	vm_size_t       ps;
1769 
1770 	ps = PAGE_SIZE;
1771 	while (start < end) {
1772 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1773 		    prot, VM_PROT_NONE, flags, FALSE);
1774 
1775 		if (kr != KERN_SUCCESS) {
1776 			panic("%s: failed pmap_enter, "
1777 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1778 			    __FUNCTION__,
1779 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1780 		}
1781 
1782 		virt += ps;
1783 		start += ps;
1784 	}
1785 	return virt;
1786 }
1787 
1788 vm_map_address_t
pmap_map_bd_with_options(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,int32_t options)1789 pmap_map_bd_with_options(
1790 	vm_map_address_t virt,
1791 	vm_offset_t start,
1792 	vm_offset_t end,
1793 	vm_prot_t prot,
1794 	int32_t options)
1795 {
1796 	pt_entry_t      tmplate;
1797 	pt_entry_t     *ptep;
1798 	vm_map_address_t vaddr;
1799 	vm_offset_t     paddr;
1800 	pt_entry_t      mem_attr;
1801 
1802 	switch (options & PMAP_MAP_BD_MASK) {
1803 	case PMAP_MAP_BD_WCOMB:
1804 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1805 		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1806 		break;
1807 	case PMAP_MAP_BD_POSTED:
1808 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1809 		break;
1810 	case PMAP_MAP_BD_POSTED_REORDERED:
1811 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1812 		break;
1813 	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1814 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1815 		break;
1816 	default:
1817 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1818 		break;
1819 	}
1820 
1821 	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1822 	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1823 #if __ARM_KERNEL_PROTECT__
1824 	tmplate |= ARM_PTE_NG;
1825 #endif /* __ARM_KERNEL_PROTECT__ */
1826 
1827 	vaddr = virt;
1828 	paddr = start;
1829 	while (paddr < end) {
1830 		ptep = pmap_pte(kernel_pmap, vaddr);
1831 		if (ptep == PT_ENTRY_NULL) {
1832 			panic("%s: no PTE for vaddr=%p, "
1833 			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
1834 			    __FUNCTION__, (void*)vaddr,
1835 			    (void*)virt, (void*)start, (void*)end, prot, options);
1836 		}
1837 
1838 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1839 		write_pte_strong(ptep, tmplate);
1840 
1841 		pte_increment_pa(tmplate);
1842 		vaddr += PAGE_SIZE;
1843 		paddr += PAGE_SIZE;
1844 	}
1845 
1846 	if (end >= start) {
1847 		flush_mmu_tlb_region(virt, (unsigned)(end - start));
1848 	}
1849 
1850 	return vaddr;
1851 }
1852 
1853 /*
1854  *      Back-door routine for mapping kernel VM at initialization.
1855  *      Useful for mapping memory outside the range
1856  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1857  *      Otherwise like pmap_map.
1858  */
1859 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1860 pmap_map_bd(
1861 	vm_map_address_t virt,
1862 	vm_offset_t start,
1863 	vm_offset_t end,
1864 	vm_prot_t prot)
1865 {
1866 	pt_entry_t      tmplate;
1867 	pt_entry_t              *ptep;
1868 	vm_map_address_t vaddr;
1869 	vm_offset_t             paddr;
1870 
1871 	/* not cacheable and not buffered */
1872 	tmplate = pa_to_pte(start)
1873 	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1874 	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1875 	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1876 #if __ARM_KERNEL_PROTECT__
1877 	tmplate |= ARM_PTE_NG;
1878 #endif /* __ARM_KERNEL_PROTECT__ */
1879 
1880 	vaddr = virt;
1881 	paddr = start;
1882 	while (paddr < end) {
1883 		ptep = pmap_pte(kernel_pmap, vaddr);
1884 		if (ptep == PT_ENTRY_NULL) {
1885 			panic("pmap_map_bd");
1886 		}
1887 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1888 		write_pte_strong(ptep, tmplate);
1889 
1890 		pte_increment_pa(tmplate);
1891 		vaddr += PAGE_SIZE;
1892 		paddr += PAGE_SIZE;
1893 	}
1894 
1895 	if (end >= start) {
1896 		flush_mmu_tlb_region(virt, (unsigned)(end - start));
1897 	}
1898 
1899 	return vaddr;
1900 }
1901 
1902 /*
1903  *      Back-door routine for mapping kernel VM at initialization.
1904  *      Useful for mapping memory specific physical addresses in early
1905  *      boot (i.e., before kernel_map is initialized).
1906  *
1907  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1908  */
1909 
1910 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1911 pmap_map_high_window_bd(
1912 	vm_offset_t pa_start,
1913 	vm_size_t len,
1914 	vm_prot_t prot)
1915 {
1916 	pt_entry_t              *ptep, pte;
1917 	vm_map_address_t        va_start = VREGION1_START;
1918 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1919 	vm_map_address_t        va_end;
1920 	vm_map_address_t        va;
1921 	vm_size_t               offset;
1922 
1923 	offset = pa_start & PAGE_MASK;
1924 	pa_start -= offset;
1925 	len += offset;
1926 
1927 	if (len > (va_max - va_start)) {
1928 		panic("%s: area too large, "
1929 		    "pa_start=%p, len=%p, prot=0x%x",
1930 		    __FUNCTION__,
1931 		    (void*)pa_start, (void*)len, prot);
1932 	}
1933 
1934 scan:
1935 	for (; va_start < va_max; va_start += PAGE_SIZE) {
1936 		ptep = pmap_pte(kernel_pmap, va_start);
1937 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1938 		if (*ptep == ARM_PTE_TYPE_FAULT) {
1939 			break;
1940 		}
1941 	}
1942 	if (va_start > va_max) {
1943 		panic("%s: insufficient pages, "
1944 		    "pa_start=%p, len=%p, prot=0x%x",
1945 		    __FUNCTION__,
1946 		    (void*)pa_start, (void*)len, prot);
1947 	}
1948 
1949 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1950 		ptep = pmap_pte(kernel_pmap, va_end);
1951 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1952 		if (*ptep != ARM_PTE_TYPE_FAULT) {
1953 			va_start = va_end + PAGE_SIZE;
1954 			goto scan;
1955 		}
1956 	}
1957 
1958 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1959 		ptep = pmap_pte(kernel_pmap, va);
1960 		pte = pa_to_pte(pa_start)
1961 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1962 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1963 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1964 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1965 #if __ARM_KERNEL_PROTECT__
1966 		pte |= ARM_PTE_NG;
1967 #endif /* __ARM_KERNEL_PROTECT__ */
1968 		write_pte_strong(ptep, pte);
1969 	}
1970 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1971 #if KASAN
1972 	kasan_notify_address(va_start, len);
1973 #endif
1974 	return va_start;
1975 }
1976 
1977 static uint32_t
pmap_compute_max_asids(void)1978 pmap_compute_max_asids(void)
1979 {
1980 	DTEntry entry;
1981 	void const *prop = NULL;
1982 	uint32_t max_asids;
1983 	int err;
1984 	unsigned int prop_size;
1985 
1986 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1987 	assert(err == kSuccess);
1988 
1989 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
1990 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
1991 		 * we can choose a more flexible default value here. */
1992 		return MAX_ASIDS;
1993 	}
1994 
1995 	if (prop_size != sizeof(max_asids)) {
1996 		panic("pmap-max-asids property is not a 32-bit integer");
1997 	}
1998 
1999 	max_asids = *((uint32_t const *)prop);
2000 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2001 	max_asids = (max_asids + 63) & ~63UL;
2002 
2003 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2004 		/* currently capped by size of pmap->sw_asid */
2005 		panic("pmap-max-asids too large");
2006 	}
2007 	if (max_asids == 0) {
2008 		panic("pmap-max-asids cannot be zero");
2009 	}
2010 	return max_asids;
2011 }
2012 
2013 #if __arm64__
2014 /*
2015  * pmap_get_arm64_prot
2016  *
2017  * return effective armv8 VMSA block protections including
2018  * table AP/PXN/XN overrides of a pmap entry
2019  *
2020  */
2021 
/*
 * Compute the effective VMSA protection bits (AP, XN, PXN) for `addr` in
 * `pmap`, folding in the hierarchical table-descriptor overrides from
 * every level walked.  Returns 0 if any level's descriptor is invalid.
 *
 * @param pmap pmap whose page tables are walked.
 * @param addr virtual address to query.
 * @return effective AP/NX/PNX bits in PTE bit positions, or 0 if unmapped.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level down, accumulating table-descriptor overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Address not mapped at this level: no effective protection. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated hierarchical overrides. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2070 #endif /* __arm64__ */
2071 
2072 /*
2073  *	Bootstrap the system enough to run with virtual memory.
2074  *
2075  *	The early VM initialization code has already allocated
2076  *	the first CPU's translation table and made entries for
2077  *	all the one-to-one mappings to be found there.
2078  *
2079  *	We must set up the kernel pmap structures, the
2080  *	physical-to-virtual translation lookup tables for the
2081  *	physical memory to be managed (between avail_start and
2082  *	avail_end).
2083  *
2084  *	Map the kernel's code and data, and allocate the system page table.
2085  *	Page_size must already be set.
2086  *
2087  *	Parameters:
2088  *	first_avail	first available physical page -
2089  *			   after kernel page tables
2090  *	avail_start	PA of first managed physical page
2091  *	avail_end	PA of last managed physical page
2092  */
2093 
2094 void
pmap_bootstrap(vm_offset_t vstart)2095 pmap_bootstrap(
2096 	vm_offset_t vstart)
2097 {
2098 	vm_map_offset_t maxoffset;
2099 
2100 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2101 
2102 #if XNU_MONITOR
2103 
2104 #if DEVELOPMENT || DEBUG
2105 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2106 #endif
2107 
2108 #if CONFIG_CSR_FROM_DT
2109 	if (csr_unsafe_kernel_text) {
2110 		pmap_ppl_disable = true;
2111 	}
2112 #endif /* CONFIG_CSR_FROM_DT */
2113 
2114 #endif /* XNU_MONITOR */
2115 
2116 #if DEVELOPMENT || DEBUG
2117 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2118 		kprintf("Kernel traces for pmap operations enabled\n");
2119 	}
2120 #endif
2121 
2122 	/*
2123 	 *	Initialize the kernel pmap.
2124 	 */
2125 #if ARM_PARAMETERIZED_PMAP
2126 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2127 #endif /* ARM_PARAMETERIZED_PMAP */
2128 #if HAS_APPLE_PAC
2129 	kernel_pmap->disable_jop = 0;
2130 #endif /* HAS_APPLE_PAC */
2131 	kernel_pmap->tte = cpu_tte;
2132 	kernel_pmap->ttep = cpu_ttep;
2133 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2134 	kernel_pmap->max = UINTPTR_MAX;
2135 	os_atomic_init(&kernel_pmap->ref_count, 1);
2136 #if XNU_MONITOR
2137 	os_atomic_init(&kernel_pmap->nested_count, 0);
2138 #endif
2139 	kernel_pmap->nx_enabled = TRUE;
2140 #ifdef  __arm64__
2141 	kernel_pmap->is_64bit = TRUE;
2142 #else
2143 	kernel_pmap->is_64bit = FALSE;
2144 #endif
2145 #if CONFIG_ROSETTA
2146 	kernel_pmap->is_rosetta = FALSE;
2147 #endif
2148 
2149 #if ARM_PARAMETERIZED_PMAP
2150 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2151 #endif /* ARM_PARAMETERIZED_PMAP */
2152 
2153 	kernel_pmap->nested_region_addr = 0x0ULL;
2154 	kernel_pmap->nested_region_size = 0x0ULL;
2155 	kernel_pmap->nested_region_asid_bitmap = NULL;
2156 	kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2157 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2158 
2159 	kernel_pmap->hw_asid = 0;
2160 	kernel_pmap->sw_asid = 0;
2161 
2162 	pmap_lock_init(kernel_pmap);
2163 
2164 	pmap_max_asids = pmap_compute_max_asids();
2165 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2166 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2167 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2168 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2169 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2170 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2171 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2172 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2173 
2174 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2175 
2176 	/**
2177 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2178 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2179 	 * space for these data structures.
2180 	 */
2181 	pmap_data_bootstrap();
2182 
2183 	/**
2184 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2185 	 */
2186 	uat_bootstrap();
2187 
2188 
2189 	/**
2190 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2191 	 */
2192 	sart_bootstrap();
2193 
2194 	/**
2195 	 * Don't make any assumptions about the alignment of avail_start before this
2196 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2197 	 */
2198 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2199 
2200 	const pmap_paddr_t pmap_struct_start = avail_start;
2201 
2202 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2203 	avail_start = round_page(avail_start + asid_table_size);
2204 
2205 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2206 
2207 	vm_first_phys = gPhysBase;
2208 	vm_last_phys = trunc_page(avail_end);
2209 
2210 	queue_init(&map_pmap_list);
2211 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2212 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2213 	free_page_size_tt_count = 0;
2214 	free_page_size_tt_max = 0;
2215 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2216 	free_two_page_size_tt_count = 0;
2217 	free_two_page_size_tt_max = 0;
2218 	free_tt_list = TT_FREE_ENTRY_NULL;
2219 	free_tt_count = 0;
2220 	free_tt_max = 0;
2221 
2222 	virtual_space_start = vstart;
2223 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2224 
2225 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2226 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2227 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2228 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2229 
2230 
2231 
2232 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2233 		maxoffset = trunc_page(maxoffset);
2234 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2235 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2236 			arm_pmap_max_offset_default = maxoffset;
2237 		}
2238 	}
2239 #if defined(__arm64__)
2240 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2241 		maxoffset = trunc_page(maxoffset);
2242 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2243 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2244 			arm64_pmap_max_offset_default = maxoffset;
2245 		}
2246 	}
2247 #endif
2248 
2249 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2250 
2251 
2252 #if PMAP_CS_PPL_MONITOR
2253 	/* Initialize the PPL trust cache read-write lock */
2254 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2255 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2256 #endif
2257 
2258 #if MACH_ASSERT
2259 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2260 	    &vm_footprint_suspend_allowed,
2261 	    sizeof(vm_footprint_suspend_allowed));
2262 #endif /* MACH_ASSERT */
2263 
2264 #if KASAN
2265 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2266 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2267 #endif /* KASAN */
2268 
2269 	/**
2270 	 * Ensure that avail_start is always left on a page boundary. The calling
2271 	 * code might not perform any alignment before allocating page tables so
2272 	 * this is important.
2273 	 */
2274 	avail_start = round_page(avail_start);
2275 }
2276 
2277 #if XNU_MONITOR
2278 
2279 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2280 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2281 {
2282 	pmap_paddr_t cur_pa;
2283 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2284 		assert(pa_valid(cur_pa));
2285 		ppattr_pa_set_monitor(cur_pa);
2286 	}
2287 }
2288 
2289 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2290 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2291     pmap_paddr_t end_pa,
2292     unsigned int expected_perm,
2293     unsigned int new_perm)
2294 {
2295 	vm_offset_t start_va = phystokv(start_pa);
2296 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2297 
2298 	pa_set_range_monitor(start_pa, end_pa);
2299 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2300 }
2301 
/*
 * Lock down every physical page backing the kernelcache so its mappings
 * cannot be modified or replaced.
 *
 * Walks the kernelcache's physical range page by page and sets the
 * PVH_FLAG_LOCKDOWN_KC flag on each page's pv_head entry.  Pages whose
 * physical-aperture address does not line up with the kernelcache's own
 * linear mapping are skipped, since those correspond to memory that will
 * be reclaimed by the OS.  Panics if a page is already locked down.
 * Under CTRR testing, the dedicated test pages are exempted again at the
 * end so the tests can modify them.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain modifiable; undo their lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2347 
/*
 * Transfer ownership and xPRR permissions of all boot-time static
 * allocations (bootstrap page tables, PPL data/text segments, PPL stacks)
 * to the PPL monitor, then lock down the kernelcache.  Called once after
 * all static allocations are complete.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = BootArgs->topOfKernelData;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2435 
/*
 * Finalize PPL lockdown: lock down the commpage mappings so they can no
 * longer be remapped, then (in the configuration-specific tail) restrict
 * the kernel RO commpage.
 *
 * NOTE(review): this translation unit ends the function with
 * `#error "XPRR configuration error"`.  In published xnu sources, Apple
 * replaces configuration-specific code with an #error directive, so this
 * path cannot compile as-is; confirm against the intended build
 * configuration before changing anything here.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
#error "XPRR configuration error"
}
2452 #endif /* XNU_MONITOR */
2453 
2454 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2455 pmap_virtual_space(
2456 	vm_offset_t *startp,
2457 	vm_offset_t *endp
2458 	)
2459 {
2460 	*startp = virtual_space_start;
2461 	*endp = virtual_space_end;
2462 }
2463 
2464 
/*
 * Describe the region_select'th kernel virtual region available to the VM
 * layer.  On success, fills *startp/*size and returns TRUE; returns FALSE
 * when region_select exceeds the number of regions for this configuration.
 * Which regions exist (and their numbering) depends on KTRR/CTRR and
 * ARM_LARGE_MEMORY configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2558 
2559 /*
2560  * Routines to track and allocate physical pages during early boot.
2561  * On most systems that memory runs from first_avail through to avail_end
2562  * with no gaps.
2563  *
2564  * If the system supports ECC and ecc_bad_pages_count > 0, we
2565  * need to skip those pages.
2566  */
2567 
/* Count of boot-time physical pages not yet handed out by pmap_next_page(). */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() lazily computes avail_page_count. */
static bool need_ram_ranges_init = true;
2570 
2571 
2572 /**
2573  * Checks to see if a given page is in
2574  * the array of known bad pages
2575  *
2576  * @param ppn page number to check
2577  */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/*
	 * Stub: this configuration maintains no bad-page list, so every page
	 * is reported as good (the ECC-skip logic described above is not
	 * present here).
	 */
	return false;
}
2583 
2584 /**
2585  * Prepare bad ram pages to be skipped.
2586  */
2587 
2588 /*
2589  * Initialize the count of available pages. No lock needed here,
2590  * as this code is called while kernel boot up is single threaded.
2591  */
2592 static void
initialize_ram_ranges(void)2593 initialize_ram_ranges(void)
2594 {
2595 	pmap_paddr_t first = first_avail;
2596 	pmap_paddr_t end = avail_end;
2597 
2598 	assert(first <= end);
2599 	assert(first == (first & ~PAGE_MASK));
2600 	assert(end == (end & ~PAGE_MASK));
2601 	avail_page_count = atop(end - first);
2602 
2603 	need_ram_ranges_init = false;
2604 }
2605 
2606 unsigned int
pmap_free_pages(void)2607 pmap_free_pages(
2608 	void)
2609 {
2610 	if (need_ram_ranges_init) {
2611 		initialize_ram_ranges();
2612 	}
2613 	return avail_page_count;
2614 }
2615 
2616 unsigned int
pmap_free_pages_span(void)2617 pmap_free_pages_span(
2618 	void)
2619 {
2620 	if (need_ram_ranges_init) {
2621 		initialize_ram_ranges();
2622 	}
2623 	return (unsigned int)atop(avail_end - first_avail);
2624 }
2625 
2626 
boolean_t
pmap_next_page_hi(
	ppnum_t            * pnum,
	__unused boolean_t might_free)
{
	/* No high/low memory preference here; delegate to pmap_next_page(). */
	return pmap_next_page(pnum);
}
2634 
2635 
2636 boolean_t
pmap_next_page(ppnum_t * pnum)2637 pmap_next_page(
2638 	ppnum_t *pnum)
2639 {
2640 	if (need_ram_ranges_init) {
2641 		initialize_ram_ranges();
2642 	}
2643 
2644 
2645 	if (first_avail != avail_end) {
2646 		*pnum = (ppnum_t)atop(first_avail);
2647 		first_avail += PAGE_SIZE;
2648 		assert(avail_page_count > 0);
2649 		--avail_page_count;
2650 		return TRUE;
2651 	}
2652 	assert(avail_page_count == 0);
2653 	return FALSE;
2654 }
2655 
2656 
2657 /*
2658  *	Initialize the pmap module.
2659  *	Called by vm_init, to initialize any structures that the pmap
2660  *	system needs to map virtual memory.
2661  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 * Each process needs its own ASID, so the process limits cannot
	 * exceed the number of ASIDs the pmap layer can hand out.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2702 
2703 /**
2704  * Verify that a given physical page contains no mappings (outside of the
2705  * default physical aperture mapping).
2706  *
2707  * @param ppnum Physical page number to check there are no mappings to.
2708  *
2709  * @return True if there are no mappings, false otherwise or if the page is not
2710  *         kernel-managed.
2711  */
2712 bool
pmap_verify_free(ppnum_t ppnum)2713 pmap_verify_free(ppnum_t ppnum)
2714 {
2715 	const pmap_paddr_t pa = ptoa(ppnum);
2716 
2717 	assert(pa != vm_page_fictitious_addr);
2718 
2719 	/* Only mappings to kernel-managed physical memory are tracked. */
2720 	if (!pa_valid(pa)) {
2721 		return false;
2722 	}
2723 
2724 	const unsigned int pai = pa_index(pa);
2725 	pv_entry_t **pvh = pai_to_pvh(pai);
2726 
2727 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2728 }
2729 
2730 #if MACH_ASSERT
2731 /**
2732  * Verify that a given physical page contains no mappings (outside of the
2733  * default physical aperture mapping) and if it does, then panic.
2734  *
2735  * @note It's recommended to use pmap_verify_free() directly when operating in
2736  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2737  *       normally being called from outside of the PPL, and the pv_head_table
2738  *       can't be modified outside of the PPL).
2739  *
2740  * @param ppnum Physical page number to check there are no mappings to.
2741  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists: locate the pv_head_table entry for the panic report. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2814 #endif
2815 
2816 
2817 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2818 pmap_root_alloc_size(pmap_t pmap)
2819 {
2820 #pragma unused(pmap)
2821 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2822 	unsigned int root_level = pt_attr_root_level(pt_attr);
2823 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2824 }
2825 
2826 
2827 /*
2828  *	Create and return a physical map.
2829  *
2830  *	If the size specified for the map
2831  *	is zero, the map is an actual physical
2832  *	map, and may be referenced by the
2833  *	hardware.
2834  *
2835  *	If the size specified is non-zero,
2836  *	the map will be used in software only, and
2837  *	is bounded by that size.
2838  */
/**
 * Internal (PPL-side when XNU_MONITOR) implementation of pmap creation.
 *
 * @param ledger ledger to charge page-table and footprint accounting to;
 *        retained here (PPL build) or simply stored (non-PPL build).
 * @param size must be 0; a non-zero size is only meaningful for stage-2
 *        configurations and is rejected here.
 * @param flags PMAP_CREATE_* flags; anything outside
 *        PMAP_CREATE_KNOWN_FLAGS is rejected.
 * @param kr out-parameter receiving the failure code on the error paths;
 *        not written on success.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject unknown flag bits rather than silently ignoring them. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* The ledger lives in shared memory; validate it before trusting it. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds yet; true_end of ~0 means "unbounded". */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* VA ceiling depends on the page-size attributes chosen above. */
	p->max = pmap_user_va_size(p);

	/* Reserve a hardware ASID (or equivalent ID) for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Make the zeroed root table visible before the pmap can be used. */
	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_asid_bitmap = NULL;
	p->nested_region_asid_bitmap_size = 0x0UL;

	p->nested_has_no_bounds_ref = false;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the pmap on the global list once fully constructed. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of construction. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it while the PPL writes through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3029 
3030 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3031 pmap_create_options(
3032 	ledger_t ledger,
3033 	vm_map_size_t size,
3034 	unsigned int flags)
3035 {
3036 	pmap_t pmap;
3037 	kern_return_t kr = KERN_SUCCESS;
3038 
3039 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3040 
3041 	ledger_reference(ledger);
3042 
3043 #if XNU_MONITOR
3044 	for (;;) {
3045 		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3046 		if (kr != KERN_RESOURCE_SHORTAGE) {
3047 			break;
3048 		}
3049 		assert(pmap == PMAP_NULL);
3050 		pmap_alloc_page_for_ppl(0);
3051 		kr = KERN_SUCCESS;
3052 	}
3053 #else
3054 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3055 #endif
3056 
3057 	if (pmap == PMAP_NULL) {
3058 		ledger_dereference(ledger);
3059 	}
3060 
3061 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3062 
3063 	return pmap;
3064 }
3065 
3066 #if XNU_MONITOR
3067 /*
3068  * This symbol remains in place when the PPL is enabled so that the dispatch
3069  * table does not change from development to release configurations.
3070  */
3071 #endif
3072 #if MACH_ASSERT || XNU_MONITOR
3073 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3074 pmap_set_process_internal(
3075 	__unused pmap_t pmap,
3076 	__unused int pid,
3077 	__unused char *procname)
3078 {
3079 #if MACH_ASSERT
3080 	if (pmap == NULL || pmap->pmap_pid == -1) {
3081 		return;
3082 	}
3083 
3084 	validate_pmap_mutable(pmap);
3085 
3086 	pmap->pmap_pid = pid;
3087 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3088 #endif /* MACH_ASSERT */
3089 }
3090 #endif /* MACH_ASSERT || XNU_MONITOR */
3091 
3092 #if MACH_ASSERT
/* Record pid/procname on the pmap, dispatching into the PPL when enabled. */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3105 #endif /* MACH_ASSERT */
3106 
3107 /*
3108  * pmap_deallocate_all_leaf_tts:
3109  *
3110  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3111  * removing and deallocating all TTEs.
3112  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf tables are deallocated by their parent's pmap_tte_deallocate(). */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of ~0 yields the last entry of a table at this level. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in a pmap being torn down. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() here — presumably
		 * pmap_tte_deallocate() drops the exclusive lock before
		 * returning; confirm against its definition.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3151 
3152 /*
3153  * We maintain stats and ledgers so that a task's physical footprint is:
3154  * phys_footprint = ((internal - alternate_accounting)
3155  *                   + (internal_compressed - alternate_accounting_compressed)
3156  *                   + iokit_mapped
3157  *                   + purgeable_nonvolatile
3158  *                   + purgeable_nonvolatile_compressed
3159  *                   + page_table)
3160  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3161  */
3162 
3163 /*
3164  *	Retire the given physical map from service.
3165  *	Should only be called if the map contains
3166  *	no valid mappings.
3167  */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop one reference; only the last reference performs the teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Refuse to destroy a pmap that any CPU still has active or in-flight. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global pmap list before freeing anything. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	/* All sub-page table chunks must have been reclaimed by now. */
	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps have no ASID of their own; flush by region instead. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	/* Verify all per-task ledger balances net out to zero. */
	pmap_check_ledgers(pmap);

	if (pmap->nested_region_asid_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_asid_bitmap,
		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3280 
3281 void
pmap_destroy(pmap_t pmap)3282 pmap_destroy(
3283 	pmap_t pmap)
3284 {
3285 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3286 
3287 	ledger_t ledger = pmap->ledger;
3288 
3289 #if XNU_MONITOR
3290 	pmap_destroy_ppl(pmap);
3291 
3292 	pmap_ledger_check_balance(pmap);
3293 #else
3294 	pmap_destroy_internal(pmap);
3295 #endif
3296 
3297 	ledger_dereference(ledger);
3298 
3299 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3300 }
3301 
3302 
3303 /*
3304  *	Add a reference to the specified pmap.
3305  */
3306 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3307 pmap_reference_internal(
3308 	pmap_t pmap)
3309 {
3310 	if (pmap != PMAP_NULL) {
3311 		validate_pmap_mutable(pmap);
3312 		os_atomic_inc(&pmap->ref_count, relaxed);
3313 	}
3314 }
3315 
/* Take an additional reference on the pmap, via the PPL when enabled. */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3326 
/*
 * Allocate a root (TT1) translation table of the given size.
 *
 * Satisfies the request from the global free lists when possible;
 * otherwise allocates fresh page(s).  Sub-page allocations carve the
 * remainder of the page into the free list for later callers.
 * Returns NULL only on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only the exact root-table size may be sub-page; otherwise round up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Try the size-appropriate free list first. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists were empty: allocate zeroed page(s) from the pmap pool. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * The caller gets the first chunk of the page; thread the rest
		 * into a local list, then splice it onto the global free list.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3403 
/*
 * Return a root (TT1) translation table to the appropriate free list,
 * then (unless PMAP_TT_DEALLOCATE_NOBLOCK) trim the page-sized free
 * lists back down to their configured maximums.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the rounding applied in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push onto the free list matching this allocation size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	/* Caller cannot block: skip trimming, which frees pages (may block). */
	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim excess single-page tables.  The lock is dropped around
	 * pmap_pages_free() and re-taken, so counts are re-checked each pass.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Trim excess two-page tables the same way. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3484 
/*
 * Allocate a non-root translation table for the given pmap and level.
 *
 * Reuses a chunk from the pmap's per-pmap free list when one exists;
 * otherwise allocates a fresh zeroed page, registers it as a page-table
 * descriptor (PTD) in the pv_head_table, and — when the pmap's page size
 * is smaller than the kernel's — threads the unused chunks of the page
 * onto the pmap's free list.
 *
 * Returns KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE when
 * PMAP_OPTIONS_NOWAIT is set and memory is unavailable; otherwise it
 * waits for pages rather than failing.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Fast path: pop a previously-carved chunk off this pmap's free list. */
	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate the descriptor tracking this page-table page. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Undo the page allocation above before bailing out. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Account as a TTE page (non-leaf) or PTE page (leaf). */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Mark the page as a page-table page in the pv_head_table. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

			/* Thread the unused sub-page chunks onto the pmap's free list. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3567 
3568 
/**
 * Release a page table previously handed out by pmap_tt_allocate().
 *
 * The table is queued on the pmap's free list; if every sub-table sharing
 * the same physical page is then free, all of them are unlinked from the
 * free list and the backing page is returned to the VM, with the associated
 * PTD torn down and the accounting counters/ledgers debited.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being released.
 * @param level Page table level of the table (for TTE/PTE page accounting).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized tables that fit in one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the PTD as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; normalize it to zero. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	/* A table with live mappings must never be deallocated. */
	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of all sub-tables sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/* Count how many sub-tables of this page already sit on the free list. */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every sub-table of this page is now free: unlink them all
			 * from the free list so the whole page can be released below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry lives in the page being freed: splice it out. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Some sub-tables are not on the free list yet: just queue this one. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* The page still hosts active tables: queue this one on the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Tear down the PTD and hand the physical page back to the VM. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3657 
3658 /**
3659  * Safely clear out a translation table entry.
3660  *
3661  * @note If the TTE to clear out points to a leaf table, then that leaf table
3662  *       must have a refcnt of zero before the TTE can be removed.
3663  * @note This function expects to be called with pmap locked exclusive, and will
3664  *       return with pmap unlocked.
3665  *
3666  * @param pmap The pmap containing the page table whose TTE is being removed.
3667  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3668  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3669  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3670  * @param ttep Pointer to the TTE that should be cleared out.
3671  * @param level The level of the page table that contains the TTE to be removed.
3672  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and force the store out to the page table walker. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per possible CPU; a set bit means that CPU is still unchecked. */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Wait (via the exclusive monitor) for the flag to change, then re-check. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* This CPU has been observed idle w.r.t. disconnect; don't re-poll it. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;
		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}

			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				comp++;
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3832 
3833 /**
3834  * Given a pointer to an entry within a `level` page table, delete the
3835  * page table at `level` + 1 that is represented by that entry. For instance,
3836  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3837  * contains the PA of the L3 table, and `level` would be "2".
3838  *
3839  * @note If the table getting deallocated is a leaf table, then that leaf table
3840  *       must have a refcnt of zero before getting deallocated. All other levels
3841  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3842  * @note This function expects to be called with pmap locked exclusive and will
3843  *       return with pmap unlocked.
3844  *
3845  * @param pmap The pmap that owns the page table to be deallocated.
3846  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3847  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3848  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3849  * @param ttep Pointer to the `level` TTE to remove.
3850  * @param level The level of the table that contains an entry pointing to the
3851  *              table to be removed. The deallocated page table will be a
3852  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3853  *              deleted).
3854  */
3855 void
3856 pmap_tte_deallocate(
3857 	pmap_t pmap,
3858 	vm_offset_t va_start,
3859 	vm_offset_t va_end,
3860 	bool need_strong_sync,
3861 	tt_entry_t *ttep,
3862 	unsigned int level)
3863 {
3864 	tt_entry_t tte;
3865 
3866 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3867 
3868 	tte = *ttep;
3869 
3870 	if (tte_get_ptd(tte)->pmap != pmap) {
3871 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3872 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3873 	}
3874 
3875 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3876 	    __func__, ttep, (unsigned long long)tte);
3877 
3878 	/* pmap_tte_remove() will drop the pmap lock */
3879 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3880 
3881 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3882 }
3883 
3884 /*
3885  *	Remove a range of hardware page-table entries.
3886  *	The entries given are the first (inclusive)
3887  *	and last (exclusive) entries for the VM pages.
3888  *	The virtual address is the va for the first pte.
3889  *
3890  *	The pmap must be locked.
3891  *	If the pmap is not the kernel pmap, the range must lie
3892  *	entirely within one pte-page.  This is NOT checked.
3893  *	Assumes that the pte-page exists.
3894  *
 *	Returns the number of PTEs changed
3896  */
3897 MARK_AS_PMAP_TEXT static int
3898 pmap_remove_range(
3899 	pmap_t pmap,
3900 	vm_map_address_t va,
3901 	pt_entry_t *bpte,
3902 	pt_entry_t *epte)
3903 {
3904 	bool need_strong_sync = false;
3905 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
3906 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
3907 	if (num_changed > 0) {
3908 		PMAP_UPDATE_TLBS(pmap, va,
3909 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
3910 	}
3911 	return num_changed;
3912 }
3913 
3914 
3915 #ifdef PVH_FLAG_EXEC
3916 
3917 /*
3918  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
3920  *	has no writable mappings anywhere in the system, including the physical
3921  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
3922  *	synchronization overhead in cases where the call to this function is
3923  *	guaranteed to be followed by other TLB operations.
3924  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	/* Caller must already hold the PVH lock for this physical page. */
	pvh_assert_locked(pai);
	/* Locate the kernel physical-aperture PTE that maps this page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the AP bits already match the requested permission. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint mapping can't be retyped one page at a time. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	/* Publish the new PTE, then invalidate stale translations. */
	write_pte_strong(pte_p, tmplate);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
3949 #endif /* defined(PVH_FLAG_EXEC) */
3950 
3951 
3952 
/**
 * Remove a range of hardware PTEs within a single page table.
 *
 * For each entry in [bpte, epte): clears compressed markers, clears valid
 * mappings, unlinks managed mappings from their physical page's PV list,
 * and accumulates counts that are then applied to the pagetable refcount
 * and the task ledgers.
 *
 * @param pmap The pmap whose mappings are being removed (locked exclusive).
 * @param va VA mapped by the first PTE; advanced per-PTE for checks/tracing.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE; must not cross a page table boundary.
 * @param eva If non-NULL, enables preemption checks; on early exit it is set
 *            to the VA at which processing stopped.
 * @param need_strong_sync Unused here aside from pass-through convention.
 * @param options PMAP_OPTIONS_REMOVE enables compressed-marker processing.
 *
 * @return The number of PTEs actually modified (for the caller's TLB flush).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The whole range must fall within one page table. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	/* Kernel VAs backed by read-only zone memory need an unlockdown on removal. */
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/* Loop until we've either taken the PVH lock for a stable PA or proven the PTE unmanaged. */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read under the PVH lock; the PTE may have changed before we locked. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* Each removed user mapping drops the pagetable refcount (applied in bulk below). */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger adjustments below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta; going non-positive is an over-release. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4189 
4190 
4191 /*
4192  *	Remove the given range of addresses
4193  *	from the specified map.
4194  *
4195  *	It is assumed that the start and end are properly
4196  *	rounded to the hardware page size.
4197  */
4198 void
4199 pmap_remove(
4200 	pmap_t pmap,
4201 	vm_map_address_t start,
4202 	vm_map_address_t end)
4203 {
4204 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4205 }
4206 
/**
 * Remove mappings for [start, end) within a single twig-level region.
 *
 * Walks to the leaf table covering `start`, removes the PTE range, performs
 * the deferred TLB maintenance, and -- for user pmaps whose leaf table
 * becomes empty -- deallocates the leaf table itself.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start First VA of the range (must not cross a twig boundary w.r.t. end).
 * @param end Non-inclusive end of the range.
 * @param options Removal options forwarded to pmap_remove_range_options().
 *
 * @return The VA up to which removal actually progressed (may be short of
 *         `end` if the operation yielded for pending preemption).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry means nothing is mapped here; we're done. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Compute the leaf PTE span corresponding to [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the user leaf table is now empty, tear it down entirely. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4273 
/**
 * Remove the given VA range from the specified pmap.
 *
 * The range is processed one twig-sized (leaf-table) chunk at a time so that
 * each internal call touches only one page table; on PPL systems each chunk
 * is dispatched through the PPL entry point.
 *
 * @param pmap The pmap to remove mappings from (no-op if PMAP_NULL).
 * @param start First VA of the range; must be leaf-page aligned.
 * @param end Non-inclusive, leaf-page-aligned end of the range.
 * @param options Removal options forwarded to the internal routine.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

#if MACH_ASSERT
	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
#endif

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the next twig (leaf-table) boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* The internal call may stop early for preemption; resume from its return VA. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4337 
4338 
4339 /*
4340  *	Remove phys addr if mapped in specified map
4341  */
4342 void
4343 pmap_remove_some_phys(
4344 	__unused pmap_t map,
4345 	__unused ppnum_t pn)
4346 {
4347 	/* Implement to support working set code */
4348 }
4349 
4350 /*
4351  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4352  * switch a thread onto a new vm_map.
4353  */
4354 void
4355 pmap_switch_user(thread_t thread, vm_map_t new_map)
4356 {
4357 	pmap_t new_pmap = new_map->pmap;
4358 
4359 
4360 	thread->map = new_map;
4361 	pmap_set_pmap(new_pmap, thread);
4362 
4363 }
4364 
/**
 * Activate the given pmap on the current CPU for the given thread.
 *
 * With __ARM_USER_PROTECT__, also caches the user translation-table base
 * and ASID in the thread's machine state.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Record the TTB (with setup bits) and hardware ASID for this thread. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4379 
4380 static void
4381 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4382 {
4383 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4384 }
4385 
4386 static inline bool
4387 pmap_user_ttb_is_clear(void)
4388 {
4389 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4390 }
4391 
/*
 * Internal implementation of pmap_switch(): make 'pmap' the active user
 * address space on the calling CPU.
 *
 * Updates per-CPU ASID bookkeeping, decides which local TLB invalidations
 * are required to avoid aliasing (software-ASID reuse, a change of
 * nested/shared-region pmap, or a commpage page-size change on
 * mixed-page-size systems), performs them, and finally installs the new
 * user translation table base via pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	/* Record the pmap being activated so per-CPU PPL data tracks the active pmap. */
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* A zero hardware ASID is only legal for the kernel pmap. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* With __ARM_KERNEL_PROTECT__, hardware ASIDs come in pairs; drop the low bit to index bookkeeping. */
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* Flush shared-region TLB entries when switching to a user pmap with a different nested pmap than last seen on this CPU. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogramming the TCR for a different page size requires a break-before-make switch. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Clear the user TTB first so no stale translations can be filled while we invalidate below. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/* Region too large for a range invalidate: flush the whole local TLB (which subsumes the ASID flush). */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Wait for any invalidations issued above to complete before installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4518 
/*
 * Activate the given pmap's address space on the current CPU,
 * dispatching to the PPL when XNU_MONITOR is configured.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	/* PPL configuration: enter the monitor to perform the switch. */
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4531 
/*
 * Lower the permission for all mappings of a given page.
 * Convenience wrapper: no option flags and no VM-layer flush argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4539 
4540 /*
4541  *	Routine:	pmap_page_protect_options
4542  *
4543  *	Function:
4544  *		Lower the permission for all mappings to a given
4545  *		page.
4546  */
/*
 * Lower the protection of (or remove) every mapping of the physical page
 * 'ppnum' by walking the page's PV list.
 *
 * @param ppnum        physical page number whose mappings are to be altered
 * @param prot         new protection: VM_PROT_ALL is a no-op; VM_PROT_READ and
 *                     VM_PROT_READ|VM_PROT_EXECUTE lower permissions in place;
 *                     anything else removes the mappings entirely
 * @param options      PMAP_OPTIONS_* flags (e.g. COMPRESSOR, NOFLUSH)
 * @param flush_range  if non-NULL, the caller assumes responsibility for TLB
 *                     invalidation of VAs inside the described range and is
 *                     informed via ptfr_flush_needed when a flush is due
 *
 * The work is done in two passes over the PV list: pass 1 rewrites PTEs and
 * updates ledgers/accounting; pass 2 issues TLB invalidations and unlinks
 * removed CPU entries (IOMMU entries are preserved).  The two passes must
 * agree on the number of updated mappings or we panic.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	pv_entry_t     *pveh_p;
	pv_entry_t     *pvet_p;
	pt_entry_t     *pte_p, *orig_pte_p;
	pv_entry_t     *new_pve_p;
	pt_entry_t     *new_pte_p;
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	unsigned int    pvh_cnt = 0;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	/* Take the PV head lock for this page; it is held across both passes. */
	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* The PV head is either a single PTE pointer, a list of PV entries, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; they are never modified by this function. */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		/* Sanity-check that this PTE really maps the target page. */
		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

		/* Decide whether the new mapping is non-executable; DEVELOPMENT/DEBUG honors the nx_enabled overrides. */
#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/* Adjust the per-task ledgers according to the page's internal/reusable/altacct state. */
			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			/* Lowering permissions in place: rebuild the PTE with read-only access. */
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		/* A TLBI is only issued by us if the VA is outside any caller-managed flush range. */
		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Synchronize the pass-1 PTE stores before any TLB invalidation is issued. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			/* Only mappings flagged in pass 1 ("was writeable") need a flush here. */
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	/* Restore kernel write access to the physical aperture mapping of a formerly-executable page. */
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/* For non-remove operations within a flush range, defer the flush to the caller. */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	pvh_unlock(pai);

	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* For the non-remove case, the flush can safely complete after dropping the PVH lock. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	/* Free the unlinked CPU PV entries outside the PVH lock. */
	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5047 
5048 MARK_AS_PMAP_TEXT void
5049 pmap_page_protect_options_internal(
5050 	ppnum_t ppnum,
5051 	vm_prot_t prot,
5052 	unsigned int options,
5053 	void *arg)
5054 {
5055 	if (arg != NULL) {
5056 		/*
5057 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5058 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5059 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5060 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5061 		 * In that case, force the flush to take place.
5062 		 */
5063 		options &= ~PMAP_OPTIONS_NOFLUSH;
5064 	}
5065 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5066 }
5067 
5068 void
5069 pmap_page_protect_options(
5070 	ppnum_t ppnum,
5071 	vm_prot_t prot,
5072 	unsigned int options,
5073 	void *arg)
5074 {
5075 	pmap_paddr_t    phys = ptoa(ppnum);
5076 
5077 	assert(ppnum != vm_page_fictitious_addr);
5078 
5079 	/* Only work with managed pages. */
5080 	if (!pa_valid(phys)) {
5081 		return;
5082 	}
5083 
5084 	/*
5085 	 * Determine the new protection.
5086 	 */
5087 	if (prot == VM_PROT_ALL) {
5088 		return;         /* nothing to do */
5089 	}
5090 
5091 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5092 
5093 #if XNU_MONITOR
5094 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5095 #else
5096 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5097 #endif
5098 
5099 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5100 }
5101 
5102 
5103 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
/*
 * PPL-side implementation: mark a user pmap as having user JOP (pointer
 * authentication) disabled.  Panics if called on the kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5113 
/* Disable user JOP for a pmap, dispatching to the PPL when XNU_MONITOR is configured. */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5123 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5124 
5125 /*
5126  * Indicates if the pmap layer enforces some additional restrictions on the
5127  * given set of protections.
5128  */
5129 bool
5130 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5131 {
5132 	return false;
5133 }
5134 
5135 /*
5136  *	Set the physical protection on the
5137  *	specified range of this map as requested.
5138  *	VERY IMPORTANT: Will not increase permissions.
5139  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5140  */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Convenience wrapper: no option flags and no VM-layer flush argument. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5150 
5151 MARK_AS_PMAP_TEXT vm_map_address_t
5152 pmap_protect_options_internal(
5153 	pmap_t pmap,
5154 	vm_map_address_t start,
5155 	vm_map_address_t end,
5156 	vm_prot_t prot,
5157 	unsigned int options,
5158 	__unused void *args)
5159 {
5160 	tt_entry_t      *tte_p;
5161 	pt_entry_t      *bpte_p, *epte_p;
5162 	pt_entry_t      *pte_p;
5163 	boolean_t        set_NX = TRUE;
5164 	boolean_t        set_XO = FALSE;
5165 	boolean_t        should_have_removed = FALSE;
5166 	bool             need_strong_sync = false;
5167 
5168 	/* Validate the pmap input before accessing its data. */
5169 	validate_pmap_mutable(pmap);
5170 
5171 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5172 
5173 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5174 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5175 	}
5176 
5177 #if DEVELOPMENT || DEBUG
5178 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5179 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5180 			should_have_removed = TRUE;
5181 		}
5182 	} else
5183 #endif
5184 	{
5185 		/* Determine the new protection. */
5186 		switch (prot) {
5187 		case VM_PROT_EXECUTE:
5188 			set_XO = TRUE;
5189 			OS_FALLTHROUGH;
5190 		case VM_PROT_READ:
5191 		case VM_PROT_READ | VM_PROT_EXECUTE:
5192 			break;
5193 		case VM_PROT_READ | VM_PROT_WRITE:
5194 		case VM_PROT_ALL:
5195 			return end;         /* nothing to do */
5196 		default:
5197 			should_have_removed = TRUE;
5198 		}
5199 	}
5200 
5201 	if (should_have_removed) {
5202 		panic("%s: should have been a remove operation, "
5203 		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5204 		    __FUNCTION__,
5205 		    pmap, (void *)start, (void *)end, prot, options, args);
5206 	}
5207 
5208 #if DEVELOPMENT || DEBUG
5209 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5210 #else
5211 	if ((prot & VM_PROT_EXECUTE))
5212 #endif
5213 	{
5214 		set_NX = FALSE;
5215 	} else {
5216 		set_NX = TRUE;
5217 	}
5218 
5219 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5220 	vm_map_address_t va = start;
5221 	unsigned int npages = 0;
5222 
5223 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5224 
5225 	tte_p = pmap_tte(pmap, start);
5226 
5227 	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5228 		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5229 		bpte_p = &bpte_p[pte_index(pt_attr, start)];
5230 		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5231 		pte_p = bpte_p;
5232 
5233 		for (pte_p = bpte_p;
5234 		    pte_p < epte_p;
5235 		    pte_p += PAGE_RATIO, va += pmap_page_size) {
5236 			++npages;
5237 			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5238 			    pmap_pending_preemption())) {
5239 				break;
5240 			}
5241 			pt_entry_t spte;
5242 #if DEVELOPMENT || DEBUG
5243 			boolean_t  force_write = FALSE;
5244 #endif
5245 
5246 			spte = *((volatile pt_entry_t*)pte_p);
5247 
5248 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5249 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5250 				continue;
5251 			}
5252 
5253 			pmap_paddr_t    pa;
5254 			unsigned int    pai = 0;
5255 			boolean_t       managed = FALSE;
5256 
5257 			while (!managed) {
5258 				/*
5259 				 * It may be possible for the pte to transition from managed
5260 				 * to unmanaged in this timeframe; for now, elide the assert.
5261 				 * We should break out as a consequence of checking pa_valid.
5262 				 */
5263 				// assert(!ARM_PTE_IS_COMPRESSED(spte));
5264 				pa = pte_to_pa(spte);
5265 				if (!pa_valid(pa)) {
5266 					break;
5267 				}
5268 				pai = pa_index(pa);
5269 				pvh_lock(pai);
5270 				spte = *((volatile pt_entry_t*)pte_p);
5271 				pa = pte_to_pa(spte);
5272 				if (pai == pa_index(pa)) {
5273 					managed = TRUE;
5274 					break; // Leave the PVH locked as we will unlock it after we free the PTE
5275 				}
5276 				pvh_unlock(pai);
5277 			}
5278 
5279 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5280 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5281 				continue;
5282 			}
5283 
5284 			pt_entry_t      tmplate;
5285 
5286 			if (pmap == kernel_pmap) {
5287 #if DEVELOPMENT || DEBUG
5288 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5289 					force_write = TRUE;
5290 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5291 				} else
5292 #endif
5293 				{
5294 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5295 				}
5296 			} else {
5297 #if DEVELOPMENT || DEBUG
5298 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5299 					assert(pmap->type != PMAP_TYPE_NESTED);
5300 					force_write = TRUE;
5301 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5302 				} else
5303 #endif
5304 				{
5305 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5306 				}
5307 			}
5308 
5309 			/*
5310 			 * XXX Removing "NX" would
5311 			 * grant "execute" access
5312 			 * immediately, bypassing any
5313 			 * checks VM might want to do
5314 			 * in its soft fault path.
5315 			 * pmap_protect() and co. are
5316 			 * not allowed to increase
5317 			 * access permissions.
5318 			 */
5319 			if (set_NX) {
5320 				tmplate |= pt_attr_leaf_xn(pt_attr);
5321 			} else {
5322 				if (pmap == kernel_pmap) {
5323 					/* do NOT clear "PNX"! */
5324 					tmplate |= ARM_PTE_NX;
5325 				} else {
5326 					/* do NOT clear "NX"! */
5327 					tmplate |= pt_attr_leaf_x(pt_attr);
5328 					if (set_XO) {
5329 						tmplate &= ~ARM_PTE_APMASK;
5330 						tmplate |= pt_attr_leaf_rona(pt_attr);
5331 					}
5332 				}
5333 			}
5334 
5335 #if DEVELOPMENT || DEBUG
5336 			if (force_write) {
5337 				/*
5338 				 * TODO: Run CS/Monitor checks here.
5339 				 */
5340 				if (managed) {
5341 					/*
5342 					 * We are marking the page as writable,
5343 					 * so we consider it to be modified and
5344 					 * referenced.
5345 					 */
5346 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5347 					tmplate |= ARM_PTE_AF;
5348 
5349 					if (ppattr_test_reffault(pai)) {
5350 						ppattr_clear_reffault(pai);
5351 					}
5352 
5353 					if (ppattr_test_modfault(pai)) {
5354 						ppattr_clear_modfault(pai);
5355 					}
5356 				}
5357 			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5358 				/*
5359 				 * An immediate request for anything other than
5360 				 * write should still mark the page as
5361 				 * referenced if managed.
5362 				 */
5363 				if (managed) {
5364 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5365 					tmplate |= ARM_PTE_AF;
5366 
5367 					if (ppattr_test_reffault(pai)) {
5368 						ppattr_clear_reffault(pai);
5369 					}
5370 				}
5371 			}
5372 #endif
5373 
5374 			/* We do not expect to write fast fault the entry. */
5375 			pte_set_was_writeable(tmplate, false);
5376 
5377 			write_pte_fast(pte_p, tmplate);
5378 
5379 			if (managed) {
5380 				pvh_assert_locked(pai);
5381 				pvh_unlock(pai);
5382 			}
5383 		}
5384 		FLUSH_PTE_STRONG();
5385 		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5386 	} else {
5387 		va = end;
5388 	}
5389 
5390 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5391 	return va;
5392 }
5393 
/*
 * Restrict the protection of every mapping in the VA range [b, e) of
 * "pmap" to "prot".  The range is processed one twig (leaf page table)
 * at a time; each chunk is handed to the internal/PPL helper, which
 * returns the VA at which processing should resume (possibly short of
 * the chunk end if it broke out to service pending preemption).
 *
 * Protection may only be reduced here; requests that would grant more
 * access are either no-ops or degenerate into a removal (see below).
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to this pmap's leaf page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		/* Immediate revocation of all access is just a removal. */
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* End of the current twig chunk, clipped to the overall end. */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

		/*
		 * The helper returns the VA it stopped at; the loop resumes
		 * from there, so an early preemption-induced exit is retried.
		 */
#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5468 
5469 /**
5470  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5471  *
5472  * @param pmap pmap to insert the pages into.
5473  * @param va virtual address to map the pages into.
5474  * @param pa page number of the first physical page to map.
5475  * @param size block size, in number of pages.
5476  * @param prot mapping protection attributes.
5477  * @param attr flags to pass to pmap_enter().
5478  *
5479  * @return KERN_SUCCESS.
5480  */
5481 kern_return_t
5482 pmap_map_block(
5483 	pmap_t pmap,
5484 	addr64_t va,
5485 	ppnum_t pa,
5486 	uint32_t size,
5487 	vm_prot_t prot,
5488 	int attr,
5489 	unsigned int flags)
5490 {
5491 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5492 }
5493 
5494 /**
5495  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5496  * As opposed to pmap_map_block(), this function takes
5497  * a physical address as an input and operates using the
5498  * page size associated with the input pmap.
5499  *
5500  * @param pmap pmap to insert the pages into.
5501  * @param va virtual address to map the pages into.
5502  * @param pa physical address of the first physical page to map.
5503  * @param size block size, in number of pages.
5504  * @param prot mapping protection attributes.
5505  * @param attr flags to pass to pmap_enter().
5506  *
5507  * @return KERN_SUCCESS.
5508  */
5509 kern_return_t
5510 pmap_map_block_addr(
5511 	pmap_t pmap,
5512 	addr64_t va,
5513 	pmap_paddr_t pa,
5514 	uint32_t size,
5515 	vm_prot_t prot,
5516 	int attr,
5517 	unsigned int flags)
5518 {
5519 #if __ARM_MIXED_PAGE_SIZE__
5520 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5521 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5522 #else
5523 	const uint64_t pmap_page_size = PAGE_SIZE;
5524 #endif
5525 
5526 	for (ppnum_t page = 0; page < size; page++) {
5527 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5528 			panic("%s: failed pmap_enter_addr, "
5529 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5530 			    __FUNCTION__,
5531 			    pmap, va, (uint64_t)pa, size, prot, flags);
5532 		}
5533 
5534 		va += pmap_page_size;
5535 		pa += pmap_page_size;
5536 	}
5537 
5538 	return KERN_SUCCESS;
5539 }
5540 
5541 kern_return_t
5542 pmap_enter_addr(
5543 	pmap_t pmap,
5544 	vm_map_address_t v,
5545 	pmap_paddr_t pa,
5546 	vm_prot_t prot,
5547 	vm_prot_t fault_type,
5548 	unsigned int flags,
5549 	boolean_t wired)
5550 {
5551 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5552 }
5553 
5554 /*
5555  *	Insert the given physical page (p) at
5556  *	the specified virtual address (v) in the
5557  *	target physical map with the protection requested.
5558  *
5559  *	If specified, the page will be wired down, meaning
5560  *	that the related pte can not be reclaimed.
5561  *
5562  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5563  *	or lose information.  That is, this routine must actually
5564  *	insert this page into the given map eventually (must make
5565  *	forward progress eventually.
5566  */
5567 kern_return_t
5568 pmap_enter(
5569 	pmap_t pmap,
5570 	vm_map_address_t v,
5571 	ppnum_t pn,
5572 	vm_prot_t prot,
5573 	vm_prot_t fault_type,
5574 	unsigned int flags,
5575 	boolean_t wired)
5576 {
5577 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5578 }
5579 
5580 /*
5581  * Attempt to commit the pte.
5582  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5583  * Performs no page table or accounting writes on failures.
5584  */
5585 static inline bool
5586 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5587 {
5588 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5589 	bool success = false, changed_wiring = false;
5590 
5591 	__unreachable_ok_push
5592 	if (TEST_PAGE_RATIO_4) {
5593 		/*
5594 		 * 16K virtual pages w/ 4K hw pages.
5595 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5596 		 * As a result we require the exclusive pmap lock.
5597 		 */
5598 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5599 		*old_pte = *pte_p;
5600 		if (*old_pte == new_pte) {
5601 			/* Another thread completed this operation. Nothing to do here. */
5602 			success = true;
5603 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5604 		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5605 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5606 			success = false;
5607 		} else {
5608 			write_pte_fast(pte_p, new_pte);
5609 			success = true;
5610 		}
5611 	} else {
5612 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5613 	}
5614 	__unreachable_ok_pop
5615 
5616 	if (success && *old_pte != new_pte) {
5617 		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5618 			FLUSH_PTE_STRONG();
5619 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5620 		} else {
5621 			FLUSH_PTE();
5622 			__builtin_arm_isb(ISB_SY);
5623 		}
5624 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5625 		    (new_pte & ARM_PTE_WIRED) != 0 :
5626 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5627 
5628 		if (pmap != kernel_pmap && changed_wiring) {
5629 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5630 			if (new_pte & ARM_PTE_WIRED) {
5631 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5632 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5633 			} else {
5634 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5635 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5636 			}
5637 		}
5638 
5639 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5640 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5641 	}
5642 	return success;
5643 }
5644 
5645 MARK_AS_PMAP_TEXT static pt_entry_t
5646 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5647 {
5648 	pt_entry_t pte;
5649 
5650 	switch (wimg & (VM_WIMG_MASK)) {
5651 	case VM_WIMG_IO:
5652 		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5653 		// Device-nGnRnE. On H14+, accesses to them can be reordered by
5654 		// AP, while preserving the security benefits of using device
5655 		// mapping against side-channel attacks. On pre-H14 platforms,
5656 		// the accesses will still be strongly ordered.
5657 		if (is_dram_addr(pa)) {
5658 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5659 		} else {
5660 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5661 		}
5662 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5663 		break;
5664 	case VM_WIMG_RT:
5665 #if HAS_UCNORMAL_MEM
5666 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5667 #else
5668 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5669 #endif
5670 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5671 		break;
5672 	case VM_WIMG_POSTED:
5673 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5674 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5675 		break;
5676 	case VM_WIMG_POSTED_REORDERED:
5677 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5678 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5679 		break;
5680 	case VM_WIMG_POSTED_COMBINED_REORDERED:
5681 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5682 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5683 		break;
5684 	case VM_WIMG_WCOMB:
5685 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5686 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5687 		break;
5688 	case VM_WIMG_WTHRU:
5689 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5690 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5691 		break;
5692 	case VM_WIMG_COPYBACK:
5693 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5694 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5695 		break;
5696 	case VM_WIMG_INNERWBACK:
5697 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5698 		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5699 		break;
5700 	default:
5701 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5702 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5703 	}
5704 
5705 	return pte;
5706 }
5707 
5708 
5709 /*
5710  * Construct a PTE (and the physical page attributes) for the given virtual to
5711  * physical mapping.
5712  *
5713  * This function has no side effects and is safe to call so that it is safe to
5714  * call while attempting a pmap_enter transaction.
5715  */
5716 MARK_AS_PMAP_TEXT static pt_entry_t
5717 pmap_construct_pte(
5718 	const pmap_t pmap,
5719 	vm_map_address_t va,
5720 	pmap_paddr_t pa,
5721 	vm_prot_t prot,
5722 	vm_prot_t fault_type,
5723 	boolean_t wired,
5724 	const pt_attr_t* const pt_attr,
5725 	uint16_t *pp_attr_bits /* OUTPUT */
5726 	)
5727 {
5728 	bool set_NX = false, set_XO = false;
5729 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5730 	assert(pp_attr_bits != NULL);
5731 	*pp_attr_bits = 0;
5732 
5733 	if (wired) {
5734 		pte |= ARM_PTE_WIRED;
5735 	}
5736 
5737 #if DEVELOPMENT || DEBUG
5738 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5739 #else
5740 	if ((prot & VM_PROT_EXECUTE))
5741 #endif
5742 	{
5743 		set_NX = false;
5744 	} else {
5745 		set_NX = true;
5746 	}
5747 
5748 	if (prot == VM_PROT_EXECUTE) {
5749 		set_XO = true;
5750 	}
5751 
5752 	if (set_NX) {
5753 		pte |= pt_attr_leaf_xn(pt_attr);
5754 	} else {
5755 		if (pmap == kernel_pmap) {
5756 			pte |= ARM_PTE_NX;
5757 		} else {
5758 			pte |= pt_attr_leaf_x(pt_attr);
5759 		}
5760 	}
5761 
5762 	if (pmap == kernel_pmap) {
5763 #if __ARM_KERNEL_PROTECT__
5764 		pte |= ARM_PTE_NG;
5765 #endif /* __ARM_KERNEL_PROTECT__ */
5766 		if (prot & VM_PROT_WRITE) {
5767 			pte |= ARM_PTE_AP(AP_RWNA);
5768 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5769 		} else {
5770 			pte |= ARM_PTE_AP(AP_RONA);
5771 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5772 		}
5773 	} else {
5774 		if (pmap->type != PMAP_TYPE_NESTED) {
5775 			pte |= ARM_PTE_NG;
5776 		} else if ((pmap->nested_region_asid_bitmap)
5777 		    && (va >= pmap->nested_region_addr)
5778 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5779 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5780 
5781 			if ((pmap->nested_region_asid_bitmap)
5782 			    && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5783 				pte |= ARM_PTE_NG;
5784 			}
5785 		}
5786 		if (prot & VM_PROT_WRITE) {
5787 			assert(pmap->type != PMAP_TYPE_NESTED);
5788 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5789 				if (fault_type & VM_PROT_WRITE) {
5790 					pte |= pt_attr_leaf_rw(pt_attr);
5791 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5792 				} else {
5793 					pte |= pt_attr_leaf_ro(pt_attr);
5794 					/*
5795 					 * Mark the page as MODFAULT so that a subsequent write
5796 					 * may be handled through arm_fast_fault().
5797 					 */
5798 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5799 					pte_set_was_writeable(pte, true);
5800 				}
5801 			} else {
5802 				pte |= pt_attr_leaf_rw(pt_attr);
5803 				*pp_attr_bits |= PP_ATTR_REFERENCED;
5804 			}
5805 		} else {
5806 			if (set_XO) {
5807 				pte |= pt_attr_leaf_rona(pt_attr);
5808 			} else {
5809 				pte |= pt_attr_leaf_ro(pt_attr);
5810 			}
5811 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5812 		}
5813 	}
5814 
5815 	pte |= ARM_PTE_AF;
5816 	return pte;
5817 }
5818 
5819 MARK_AS_PMAP_TEXT kern_return_t
5820 pmap_enter_options_internal(
5821 	pmap_t pmap,
5822 	vm_map_address_t v,
5823 	pmap_paddr_t pa,
5824 	vm_prot_t prot,
5825 	vm_prot_t fault_type,
5826 	unsigned int flags,
5827 	boolean_t wired,
5828 	unsigned int options)
5829 {
5830 	ppnum_t         pn = (ppnum_t)atop(pa);
5831 	pt_entry_t      pte;
5832 	pt_entry_t      spte;
5833 	pt_entry_t      *pte_p;
5834 	bool            refcnt_updated;
5835 	bool            wiredcnt_updated;
5836 	bool            ro_va = false;
5837 	unsigned int    wimg_bits;
5838 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
5839 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
5840 	kern_return_t   kr = KERN_SUCCESS;
5841 	uint16_t pp_attr_bits;
5842 	volatile uint16_t *refcnt;
5843 	volatile uint16_t *wiredcnt;
5844 	pv_free_list_t *local_pv_free;
5845 
5846 	validate_pmap_mutable(pmap);
5847 
5848 #if XNU_MONITOR
5849 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
5850 		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
5851 	}
5852 #endif
5853 
5854 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5855 
5856 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
5857 		panic("pmap_enter_options() pmap %p v 0x%llx",
5858 		    pmap, (uint64_t)v);
5859 	}
5860 
5861 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
5862 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
5863 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
5864 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
5865 	}
5866 
5867 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
5868 		panic("pmap_enter_options() pmap %p pa 0x%llx",
5869 		    pmap, (uint64_t)pa);
5870 	}
5871 
5872 	/* The PA should not extend beyond the architected physical address space */
5873 	pa &= ARM_PTE_PAGE_MASK;
5874 
5875 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
5876 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
5877 		extern vm_offset_t ctrr_test_page;
5878 		if (__probable(v != ctrr_test_page))
5879 #endif
5880 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
5881 	}
5882 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
5883 		if (__improbable(prot != VM_PROT_READ)) {
5884 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
5885 			    __func__, (unsigned long long)v, prot);
5886 		}
5887 		ro_va = true;
5888 	}
5889 	assert(pn != vm_page_fictitious_addr);
5890 
5891 	refcnt_updated = false;
5892 	wiredcnt_updated = false;
5893 
5894 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
5895 		/*
5896 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
5897 		 *
5898 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
5899 		 */
5900 		lock_mode = PMAP_LOCK_EXCLUSIVE;
5901 	}
5902 
5903 	if (!pmap_lock_preempt(pmap, lock_mode)) {
5904 		return KERN_ABORTED;
5905 	}
5906 
5907 	/*
5908 	 *	Expand pmap to include this pte.  Assume that
5909 	 *	pmap is always expanded to include enough hardware
5910 	 *	pages to map one VM page.
5911 	 */
5912 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
5913 		/* Must unlock to expand the pmap. */
5914 		pmap_unlock(pmap, lock_mode);
5915 
5916 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
5917 
5918 		if (kr != KERN_SUCCESS) {
5919 			return kr;
5920 		}
5921 
5922 		if (!pmap_lock_preempt(pmap, lock_mode)) {
5923 			return KERN_ABORTED;
5924 		}
5925 	}
5926 
5927 	if (options & PMAP_OPTIONS_NOENTER) {
5928 		pmap_unlock(pmap, lock_mode);
5929 		return KERN_SUCCESS;
5930 	}
5931 
5932 	/*
5933 	 * Since we may not hold the pmap lock exclusive, updating the pte is
5934 	 * done via a cmpxchg loop.
5935 	 * We need to be careful about modifying non-local data structures before commiting
5936 	 * the new pte since we may need to re-do the transaction.
5937 	 */
5938 	spte = os_atomic_load(pte_p, relaxed);
5939 	while (!committed) {
5940 		refcnt = NULL;
5941 		wiredcnt = NULL;
5942 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
5943 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
5944 
5945 		if (pmap != kernel_pmap) {
5946 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
5947 			refcnt = &ptd_info->refcnt;
5948 			wiredcnt = &ptd_info->wiredcnt;
5949 			/*
5950 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
5951 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
5952 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
5953 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
5954 			 * have PTDs, so we can't use the check there.
5955 			 */
5956 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
5957 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
5958 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
5959 			}
5960 			/*
5961 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
5962 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
5963 			 * or acquire the pmap lock exclusive.
5964 			 */
5965 			if (!wiredcnt_updated) {
5966 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
5967 				wiredcnt_updated = true;
5968 			}
5969 			if (!refcnt_updated) {
5970 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
5971 				refcnt_updated = true;
5972 				drop_refcnt = true;
5973 			}
5974 		}
5975 
5976 #if XNU_MONITOR
5977 		/**
5978 		 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
5979 		 * but in support of hibernation we allow temporary read-only mappings of these pages to be
5980 		 * created and later removed.  We must therefore prevent an attacker from downgrading a
5981 		 * a writable mapping in order to allow it to be removed and remapped to something else.
5982 		 */
5983 		if (__improbable(had_valid_mapping && !pa_valid(pte_to_pa(spte)) &&
5984 		    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !(prot & VM_PROT_WRITE) &&
5985 		    (pmap_cache_attributes((ppnum_t)atop(pte_to_pa(spte))) & PP_ATTR_MONITOR))) {
5986 			panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
5987 			    __func__, (uint64_t)pte_to_pa(spte));
5988 		}
5989 #endif
5990 
5991 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
5992 			/*
5993 			 * There is already a mapping here & it's for a different physical page.
5994 			 * First remove that mapping.
5995 			 *
5996 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
5997 			 */
5998 			if (lock_mode == PMAP_LOCK_SHARED) {
5999 				if (pmap_lock_shared_to_exclusive(pmap)) {
6000 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6001 				} else {
6002 					/*
6003 					 * We failed to upgrade to an exclusive lock.
6004 					 * As a result we no longer hold the lock at all,
6005 					 * so we need to re-acquire it and restart the transaction.
6006 					 */
6007 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6008 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6009 					/* pmap might have changed after we dropped the lock. Try again. */
6010 					spte = os_atomic_load(pte_p, relaxed);
6011 					continue;
6012 				}
6013 			}
6014 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6015 			spte = ARM_PTE_TYPE_FAULT;
6016 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6017 		}
6018 
6019 		/*
6020 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6021 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6022 		 * read-write protection. The PMAP layer though still needs to use the right
6023 		 * index, which is the older XO-now-TPRO one and that is specially selected
6024 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6025 		 */
6026 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6027 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6028 		} else {
6029 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6030 		}
6031 
6032 		if (pa_valid(pa)) {
6033 			unsigned int pai;
6034 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6035 
6036 			is_internal = FALSE;
6037 			is_altacct = FALSE;
6038 
6039 			pai = pa_index(pa);
6040 
6041 			pvh_lock(pai);
6042 
6043 			/*
6044 			 * Make sure that the current per-cpu PV free list has
6045 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6046 			 * if the transaction succeeds. We're either in the
6047 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6048 			 * Note that we can still be interrupted, but a primary
6049 			 * interrupt handler can never enter the pmap.
6050 			 */
6051 #if !XNU_MONITOR
6052 			assert(get_preemption_level() > 0);
6053 #endif
6054 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6055 			pv_entry_t **pv_h = pai_to_pvh(pai);
6056 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6057 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6058 
6059 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6060 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6061 				int new_allocated_pves = 0;
6062 
6063 				while (new_allocated_pves < 2) {
6064 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6065 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6066 					if (pv_status == PV_ALLOC_FAIL) {
6067 						break;
6068 					} else if (pv_status == PV_ALLOC_RETRY) {
6069 						/*
6070 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6071 						 * it will have dropped the pmap lock while doing so.
6072 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6073 						 * be on a different CPU now.
6074 						 */
6075 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6076 					} else {
6077 						/* If we've gotten this far then a node should've been allocated. */
6078 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6079 
6080 						new_allocated_pves++;
6081 					}
6082 				}
6083 
6084 				for (int i = 0; i < new_allocated_pves; i++) {
6085 					pv_free(new_pve_p[i]);
6086 				}
6087 			}
6088 
6089 			if (pv_status == PV_ALLOC_FAIL) {
6090 				pvh_unlock(pai);
6091 				kr = KERN_RESOURCE_SHORTAGE;
6092 				break;
6093 			} else if (pv_status == PV_ALLOC_RETRY) {
6094 				pvh_unlock(pai);
6095 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6096 				spte = os_atomic_load(pte_p, relaxed);
6097 				continue;
6098 			}
6099 
6100 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6101 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6102 			} else {
6103 				wimg_bits = pmap_cache_attributes(pn);
6104 			}
6105 
6106 			/* We may be retrying this operation after dropping the PVH lock.
6107 			 * Cache attributes for the physical page may have changed while the lock
6108 			 * was dropped, so clear any cache attributes we may have previously set
6109 			 * in the PTE template. */
6110 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6111 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6112 
6113 #if XNU_MONITOR
6114 			/* The regular old kernel is not allowed to remap PPL pages. */
6115 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6116 				panic("%s: page belongs to PPL, "
6117 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6118 				    __FUNCTION__,
6119 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6120 			}
6121 
6122 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6123 				panic("%s: page locked down, "
6124 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6125 				    __FUNCTION__,
6126 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6127 			}
6128 #endif
6129 
6130 
6131 
6132 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6133 			if (!committed) {
6134 				pvh_unlock(pai);
6135 				continue;
6136 			}
6137 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6138 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6139 
6140 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6141 			/*
6142 			 * If there was already a valid pte here then we reuse its reference
6143 			 * on the ptd and drop the one that we took above.
6144 			 */
6145 			drop_refcnt = had_valid_mapping;
6146 
6147 			if (!had_valid_mapping) {
6148 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6149 				int pve_ptep_idx = 0;
6150 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6151 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6152 				if (pv_status != PV_ALLOC_SUCCESS) {
6153 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6154 					    __func__, pv_status, new_pve_p, pmap);
6155 				}
6156 
6157 				if (pmap != kernel_pmap) {
6158 					if (options & PMAP_OPTIONS_INTERNAL) {
6159 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6160 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6161 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6162 							/*
6163 							 * Make a note to ourselves that this
6164 							 * mapping is using alternative
6165 							 * accounting. We'll need this in order
6166 							 * to know which ledger to debit when
6167 							 * the mapping is removed.
6168 							 *
6169 							 * The altacct bit must be set while
6170 							 * the pv head is locked. Defer the
6171 							 * ledger accounting until after we've
6172 							 * dropped the lock.
6173 							 */
6174 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6175 							is_altacct = TRUE;
6176 						}
6177 					}
6178 					if (ppattr_test_reusable(pai) &&
6179 					    !is_altacct) {
6180 						is_reusable = TRUE;
6181 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6182 						is_internal = TRUE;
6183 					} else {
6184 						is_external = TRUE;
6185 					}
6186 				}
6187 			}
6188 
6189 			pvh_unlock(pai);
6190 
6191 			if (pp_attr_bits != 0) {
6192 				ppattr_pa_set_bits(pa, pp_attr_bits);
6193 			}
6194 
6195 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6196 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6197 
6198 				if (is_internal) {
6199 					/*
6200 					 * Make corresponding adjustments to
6201 					 * phys_footprint statistics.
6202 					 */
6203 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6204 					if (is_altacct) {
6205 						/*
6206 						 * If this page is internal and
6207 						 * in an IOKit region, credit
6208 						 * the task's total count of
6209 						 * dirty, internal IOKit pages.
6210 						 * It should *not* count towards
6211 						 * the task's total physical
6212 						 * memory footprint, because
6213 						 * this entire region was
6214 						 * already billed to the task
6215 						 * at the time the mapping was
6216 						 * created.
6217 						 *
6218 						 * Put another way, this is
6219 						 * internal++ and
6220 						 * alternate_accounting++, so
6221 						 * net effect on phys_footprint
6222 						 * is 0. That means: don't
6223 						 * touch phys_footprint here.
6224 						 */
6225 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6226 					} else {
6227 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6228 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6229 							skip_footprint_debit = true;
6230 						} else {
6231 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6232 						}
6233 					}
6234 				}
6235 				if (is_reusable) {
6236 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6237 				} else if (is_external) {
6238 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6239 				}
6240 			}
6241 		} else {
6242 			if (prot & VM_PROT_EXECUTE) {
6243 				kr = KERN_FAILURE;
6244 				break;
6245 			}
6246 
6247 			wimg_bits = pmap_cache_attributes(pn);
6248 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6249 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6250 			}
6251 
6252 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6253 
6254 #if XNU_MONITOR
6255 			if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
6256 				uint64_t xprr_perm = pte_to_xprr_perm(pte);
6257 				switch (xprr_perm) {
6258 				case XPRR_KERN_RO_PERM:
6259 					break;
6260 				case XPRR_KERN_RW_PERM:
6261 					pte &= ~ARM_PTE_XPRR_MASK;
6262 					pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
6263 					break;
6264 				default:
6265 					panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
6266 				}
6267 			}
6268 #endif
6269 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6270 			if (committed) {
6271 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6272 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6273 
6274 				/**
6275 				 * If there was already a valid pte here then we reuse its
6276 				 * reference on the ptd and drop the one that we took above.
6277 				 */
6278 				drop_refcnt = had_valid_mapping;
6279 			}
6280 		}
6281 		if (committed) {
6282 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6283 				assert(pmap != kernel_pmap);
6284 
6285 				/* One less "compressed" */
6286 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6287 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6288 
6289 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6290 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6291 				} else if (!skip_footprint_debit) {
6292 					/* Was part of the footprint */
6293 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6294 				}
6295 				/* The old entry held a reference so drop the extra one that we took above. */
6296 				drop_refcnt = true;
6297 			}
6298 		}
6299 	}
6300 
6301 	if (drop_refcnt && refcnt != NULL) {
6302 		assert(refcnt_updated);
6303 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6304 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6305 		}
6306 	}
6307 
6308 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6309 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6310 	}
6311 
6312 	pmap_unlock(pmap, lock_mode);
6313 
6314 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6315 		pmap_phys_write_disable(v);
6316 	}
6317 
6318 	return kr;
6319 }
6320 
/*
 * Establish a mapping for physical address [pa] at virtual address [v],
 * retrying the (possibly PPL-mediated) internal helper until it succeeds,
 * fails permanently, or the caller requested not to wait for pages.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * The PPL must never block on allocation, so NOWAIT is forced
		 * here; shortages are serviced below by feeding the PPL free
		 * list from outside the PPL and retrying.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Replenish the PPL's page supply before retrying. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller asked not to wait; surface the shortage. */
				break;
			}
		}
		/* KERN_ABORTED: the helper bailed out (e.g. pending preemption); retry. */
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6366 
6367 kern_return_t
6368 pmap_enter_options(
6369 	pmap_t pmap,
6370 	vm_map_address_t v,
6371 	ppnum_t pn,
6372 	vm_prot_t prot,
6373 	vm_prot_t fault_type,
6374 	unsigned int flags,
6375 	boolean_t wired,
6376 	unsigned int options,
6377 	__unused void   *arg)
6378 {
6379 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6380 }
6381 
6382 /*
6383  *	Routine:	pmap_change_wiring
6384  *	Function:	Change the wiring attribute for a map/virtual-address
6385  *			pair.
6386  *	In/out conditions:
6387  *			The mapping must already exist in the pmap.
6388  */
6389 MARK_AS_PMAP_TEXT kern_return_t
6390 pmap_change_wiring_internal(
6391 	pmap_t pmap,
6392 	vm_map_address_t v,
6393 	boolean_t wired)
6394 {
6395 	pt_entry_t     *pte_p;
6396 	pmap_paddr_t    pa;
6397 
6398 	validate_pmap_mutable(pmap);
6399 
6400 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6401 		return KERN_ABORTED;
6402 	}
6403 
6404 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6405 
6406 	pte_p = pmap_pte(pmap, v);
6407 	if (pte_p == PT_ENTRY_NULL) {
6408 		if (!wired) {
6409 			/*
6410 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6411 			 * may have been freed by a remove operation.
6412 			 */
6413 			goto pmap_change_wiring_return;
6414 		} else {
6415 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6416 		}
6417 	}
6418 	/*
6419 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6420 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6421 	 */
6422 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6423 
6424 	while (pa_valid(pa)) {
6425 		pmap_paddr_t new_pa;
6426 
6427 		pvh_lock(pa_index(pa));
6428 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6429 
6430 		if (pa == new_pa) {
6431 			break;
6432 		}
6433 
6434 		pvh_unlock(pa_index(pa));
6435 		pa = new_pa;
6436 	}
6437 
6438 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6439 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6440 		if (!wired) {
6441 			/* PTE cleared by prior remove/disconnect operation */
6442 			goto pmap_change_wiring_cleanup;
6443 		} else {
6444 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6445 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6446 		}
6447 	}
6448 
6449 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6450 	if (wired != pte_is_wired(*pte_p)) {
6451 		pte_set_wired(pmap, pte_p, wired);
6452 		if (pmap != kernel_pmap) {
6453 			if (wired) {
6454 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6455 			} else if (!wired) {
6456 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6457 			}
6458 		}
6459 	}
6460 
6461 pmap_change_wiring_cleanup:
6462 	if (pa_valid(pa)) {
6463 		pvh_unlock(pa_index(pa));
6464 	}
6465 
6466 pmap_change_wiring_return:
6467 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6468 
6469 	return KERN_SUCCESS;
6470 }
6471 
/*
 * Public entry point for changing a mapping's wired attribute.  Any failure
 * from the internal helper (other than the retried KERN_ABORTED on PPL
 * systems) is treated as fatal.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6499 
6500 MARK_AS_PMAP_TEXT pmap_paddr_t
6501 pmap_find_pa_internal(
6502 	pmap_t pmap,
6503 	addr64_t va)
6504 {
6505 	pmap_paddr_t    pa = 0;
6506 
6507 	validate_pmap(pmap);
6508 
6509 	if (pmap != kernel_pmap) {
6510 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6511 	}
6512 
6513 	pa = pmap_vtophys(pmap, va);
6514 
6515 	if (pmap != kernel_pmap) {
6516 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6517 	}
6518 
6519 	return pa;
6520 }
6521 
6522 pmap_paddr_t
6523 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6524 {
6525 	pmap_paddr_t pa = 0;
6526 
6527 	if (pmap == kernel_pmap) {
6528 		pa = mmu_kvtop(va);
6529 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6530 		/*
6531 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6532 		 * translation even if PAN would prevent kernel access through the translation.
6533 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6534 		 */
6535 		pa = mmu_uvtop(va);
6536 	}
6537 	return pa;
6538 }
6539 
6540 pmap_paddr_t
6541 pmap_find_pa(
6542 	pmap_t pmap,
6543 	addr64_t va)
6544 {
6545 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6546 
6547 	if (pa != 0) {
6548 		return pa;
6549 	}
6550 
6551 	if (not_in_kdp) {
6552 #if XNU_MONITOR
6553 		return pmap_find_pa_ppl(pmap, va);
6554 #else
6555 		return pmap_find_pa_internal(pmap, va);
6556 #endif
6557 	} else {
6558 		return pmap_vtophys(pmap, va);
6559 	}
6560 }
6561 
6562 ppnum_t
6563 pmap_find_phys_nofault(
6564 	pmap_t pmap,
6565 	addr64_t va)
6566 {
6567 	ppnum_t ppn;
6568 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6569 	return ppn;
6570 }
6571 
6572 ppnum_t
6573 pmap_find_phys(
6574 	pmap_t pmap,
6575 	addr64_t va)
6576 {
6577 	ppnum_t ppn;
6578 	ppn = atop(pmap_find_pa(pmap, va));
6579 	return ppn;
6580 }
6581 
6582 /**
6583  * Translate a kernel virtual address into a physical address.
6584  *
6585  * @param va The kernel virtual address to translate. Does not work on user
6586  *           virtual addresses.
6587  *
6588  * @return The physical address if the translation was successful, or zero if
6589  *         no valid mappings were found for the given virtual address.
6590  */
6591 pmap_paddr_t
6592 kvtophys(vm_offset_t va)
6593 {
6594 	/**
6595 	 * Attempt to do the translation first in hardware using the AT (address
6596 	 * translation) instruction. This will attempt to use the MMU to do the
6597 	 * translation for us.
6598 	 */
6599 	pmap_paddr_t pa = mmu_kvtop(va);
6600 
6601 	if (pa) {
6602 		return pa;
6603 	}
6604 
6605 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6606 	return pmap_vtophys(kernel_pmap, va);
6607 }
6608 
6609 /**
6610  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6611  * points to a non-kernel-managed physical page, then this call will panic().
6612  *
6613  * @note The output of this function is guaranteed to be a kernel-managed
6614  *       physical page, which means it's safe to pass the output directly to
6615  *       pa_index() to create a physical address index for various pmap data
6616  *       structures.
6617  *
6618  * @param va The kernel virtual address to translate. Does not work on user
6619  *           virtual addresses.
6620  *
6621  * @return The translated physical address for the given virtual address.
6622  */
6623 pmap_paddr_t
6624 kvtophys_nofail(vm_offset_t va)
6625 {
6626 	pmap_paddr_t pa = kvtophys(va);
6627 
6628 	if (!pa_valid(pa)) {
6629 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6630 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6631 	}
6632 
6633 	return pa;
6634 }
6635 
/*
 * Walk the page tables of [pmap] in software to translate [va].
 * Returns 0 if [va] is outside the pmap's VA range or no valid mapping
 * (leaf or block) exists.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend one table level per iteration, from root toward leaf. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at this level: translation fault, no mapping. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with this level's VA offset bits. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: follow it down to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6680 
6681 /*
6682  *	pmap_init_pte_page - Initialize a page table page.
6683  */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	/* Locate the PV head entry for the physical page backing this table. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* The page already carries a PTD (e.g. 16K-on-4K bootstrap); reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	/* Record the owning pmap, base VA and level for this table page. */
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6723 
6724 /*
6725  *	Routine:	pmap_expand
6726  *
6727  *	Expands a pmap to be able to map the specified virtual address.
6728  *
6729  *	Allocates new memory for the default (COARSE) translation table
6730  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6731  *	also allocates space for the corresponding pv entries.
6732  *
6733  *	Nothing should be locked.
6734  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Walk down one table level per iteration until tables exist through [level]. */
	for (; ttlevel < level; ttlevel++) {
		/* Bail out (KERN_ABORTED, caller may retry) rather than blocking with preemption pending. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock to allocate; a racing expander is re-checked after relocking. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					return KERN_RESOURCE_SHORTAGE;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Install the new table only if no one else expanded this slot meanwhile. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If the allocated table page was not consumed (race lost), release it. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6804 
6805 /*
6806  *	Routine:	pmap_gc
6807  *	Function:
6808  *              Pmap garbage collection
6809  *		Called by the pageout daemon when pages are scarce.
6810  *
6811  */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
6823 
6824 /*
6825  *      By default, don't attempt pmap GC more frequently
6826  *      than once / 1 minutes.
6827  */
6828 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: pmap GC throttling is not implemented here. */
}
6834 
6835 /*
6836  * pmap_attribute_cache_sync(vm_offset_t pa)
6837  *
6838  * Invalidates all of the instruction cache on a physical page and
6839  * pushes any dirty data from the data cache for the same physical page
6840  */
6841 
6842 kern_return_t
6843 pmap_attribute_cache_sync(
6844 	ppnum_t pp,
6845 	vm_size_t size,
6846 	__unused vm_machine_attribute_t attribute,
6847 	__unused vm_machine_attribute_val_t * value)
6848 {
6849 	if (size > PAGE_SIZE) {
6850 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6851 	} else {
6852 		cache_sync_page(pp);
6853 	}
6854 
6855 	return KERN_SUCCESS;
6856 }
6857 
6858 /*
6859  * pmap_sync_page_data_phys(ppnum_t pp)
6860  *
6861  * Invalidates all of the instruction cache on a physical page and
6862  * pushes any dirty data from the data cache for the same physical page
6863  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Delegate the full-page cache synchronization to cache_sync_page(). */
	cache_sync_page(pp);
}
6870 
6871 /*
6872  * pmap_sync_page_attributes_phys(ppnum_t pp)
6873  *
6874  * Write back and invalidate all cachelines on a physical page.
6875  */
6876 void
6877 pmap_sync_page_attributes_phys(
6878 	ppnum_t pp)
6879 {
6880 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6881 }
6882 
6883 #if CONFIG_COREDUMP
6884 /* temporary workaround */
6885 boolean_t
6886 coredumpok(
6887 	vm_map_t map,
6888 	mach_vm_offset_t va)
6889 {
6890 	pt_entry_t     *pte_p;
6891 	pt_entry_t      spte;
6892 
6893 	pte_p = pmap_pte(map->pmap, va);
6894 	if (0 == pte_p) {
6895 		return FALSE;
6896 	}
6897 	if (vm_map_entry_has_device_pager(map, va)) {
6898 		return FALSE;
6899 	}
6900 	spte = *pte_p;
6901 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
6902 }
6903 #endif
6904 
6905 void
6906 fillPage(
6907 	ppnum_t pn,
6908 	unsigned int fill)
6909 {
6910 	unsigned int   *addr;
6911 	int             count;
6912 
6913 	addr = (unsigned int *) phystokv(ptoa(pn));
6914 	count = PAGE_SIZE / sizeof(unsigned int);
6915 	while (count--) {
6916 		*addr++ = fill;
6917 	}
6918 }
6919 
6920 extern void     mapping_set_mod(ppnum_t pn);
6921 
void
mapping_set_mod(
	ppnum_t pn)
{
	/* VM-layer alias for setting the page's cached "modified" attribute. */
	pmap_set_modify(pn);
}
6928 
6929 extern void     mapping_set_ref(ppnum_t pn);
6930 
void
mapping_set_ref(
	ppnum_t pn)
{
	/* VM-layer alias for setting the page's cached "referenced" attribute. */
	pmap_set_reference(pn);
}
6937 
6938 /*
6939  * Clear specified attribute bits.
6940  *
6941  * Try to force an arm_fast_fault() for all mappings of
6942  * the page - to force attributes to be set again at fault time.
6943  * If the forcing succeeds, clear the cached bits at the head.
6944  * Otherwise, something must have been wired, so leave the cached
6945  * attributes alone.
6946  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-provided flush mechanism overrides any request to skip TLB flushing. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs\n",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Determine which access modes must fault so each attribute can be re-observed. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Clear the cached bits only if all of the page's mappings could be demoted. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7014 
/*
 * Single-page attribute clear with no ranged-flush batching; helper for
 * phys_attribute_clear().
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/* NULL flush_range: TLB maintenance, if any, is handled per page. */
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7024 
7025 #if __ARM_RANGE_TLBI__
/*
 * Clear PP_ATTR_* [bits] on each managed page mapped in [start, end), which
 * must lie within a single twig-level table entry.
 *
 * @return [end] if the whole range was processed, otherwise the VA at which
 *         the walk stopped because preemption became pending.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield on pending preemption, but always process at least one page. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only kernel-managed pages carry cached pp_attr state. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7069 
/*
 * Clear PP_ATTR_* [bits] for managed pages mapped in [start, end) of
 * [pmap], deferring TLB maintenance into a single ranged flush issued
 * after the walk.
 *
 * @return [end] on completion, or the first unprocessed VA if the walk was
 *         cut short by pending preemption (the caller may resume from it).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Process at most one twig-level table entry per iteration. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* A short return from the twig walk, or pending preemption, ends this pass. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the single deferred ranged TLB invalidation, if any mapping changed. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true);
		sync_tlb_flush();
	}
	return va;
}
7118 
/*
 * Front end for ranged attribute clearing: repeatedly invokes the
 * (possibly PPL-mediated) internal routine until the whole [start, end)
 * range has been processed, allowing preemption between passes.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* Each call advances 'start'; loop until the range is exhausted. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7149 #endif /* __ARM_RANGE_TLBI__ */
7150 
/*
 * Clear attribute bits for a single physical page, dispatching to the PPL
 * on monitor-enabled systems; bracketed by trace points.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7172 
7173 /*
7174  *	Set specified attribute bits.
7175  *
7176  *	Set cached value in the pv head because we have
7177  *	no per-mapping hardware support for referenced and
7178  *	modify bits.
7179  */
7180 MARK_AS_PMAP_TEXT void
7181 phys_attribute_set_internal(
7182 	ppnum_t pn,
7183 	unsigned int bits)
7184 {
7185 	pmap_paddr_t    pa = ptoa(pn);
7186 	assert(pn != vm_page_fictitious_addr);
7187 
7188 #if XNU_MONITOR
7189 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7190 		panic("%s: illegal request, "
7191 		    "pn=%u, bits=%#x",
7192 		    __FUNCTION__,
7193 		    pn, bits);
7194 	}
7195 #endif
7196 
7197 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7198 
7199 	return;
7200 }
7201 
/* Dispatch attribute setting to the PPL on monitor-enabled systems. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7213 
7214 
7215 /*
7216  *	Check specified attribute bits.
7217  *
7218  *	use the software cached bits (since no hw support).
7219  */
7220 static boolean_t
7221 phys_attribute_test(
7222 	ppnum_t pn,
7223 	unsigned int bits)
7224 {
7225 	pmap_paddr_t    pa = ptoa(pn);
7226 	assert(pn != vm_page_fictitious_addr);
7227 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7228 }
7229 
7230 
7231 /*
7232  *	Set the modify/reference bits on the specified physical page.
7233  */
void
pmap_set_modify(ppnum_t pn)
{
	/* ARM tracks the modified state in software via the pp_attr table. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7239 
7240 
7241 /*
7242  *	Clear the modify bits on the specified physical page.
7243  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clearing PP_ATTR_MODIFIED also re-arms write fast-faults on mappings. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7250 
7251 
7252 /*
7253  *	pmap_is_modified:
7254  *
7255  *	Return whether or not the specified physical page is modified
7256  *	by any physical maps.
7257  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Software-cached attribute; no PTE scan required on ARM. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7264 
7265 
7266 /*
7267  *	Set the reference bit on the specified physical page.
7268  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* ARM tracks the referenced state in software via the pp_attr table. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7275 
7276 /*
7277  *	Clear the reference bits on the specified physical page.
7278  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clearing PP_ATTR_REFERENCED also re-arms read fast-faults on mappings. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7285 
7286 
7287 /*
7288  *	pmap_is_referenced:
7289  *
7290  *	Return whether or not the specified physical page is referenced
7291  *	by any physical maps.
7292  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Software-cached attribute; no PTE scan required on ARM. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7299 
7300 /*
7301  * pmap_get_refmod(phys)
7302  *  returns the referenced and modified bits of the specified
7303  *  physical page.
7304  */
7305 unsigned int
7306 pmap_get_refmod(
7307 	ppnum_t pn)
7308 {
7309 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7310 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7311 }
7312 
7313 static inline unsigned int
7314 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7315 {
7316 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7317 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7318 }
7319 
7320 /*
7321  * pmap_clear_refmod(phys, mask)
7322  *  clears the referenced and modified bits as specified by the mask
7323  *  of the specified physical page.
7324  */
7325 void
7326 pmap_clear_refmod_options(
7327 	ppnum_t         pn,
7328 	unsigned int    mask,
7329 	unsigned int    options,
7330 	void            *arg)
7331 {
7332 	unsigned int    bits;
7333 
7334 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7335 	phys_attribute_clear(pn, bits, options, arg);
7336 }
7337 
7338 /*
7339  * Perform pmap_clear_refmod_options on a virtual address range.
7340  * The operation will be performed in bulk & tlb flushes will be coalesced
7341  * if possible.
7342  *
7343  * Returns true if the operation is supported on this platform.
7344  * If this function returns false, the operation is not supported and
7345  * nothing has been modified in the pmap.
7346  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	/* Translate VM_MEM_* request bits into the PP_ATTR_* bits stored per page. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7374 
/* Convenience wrapper: clear refmod bits with no options or argument. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7382 
7383 unsigned int
7384 pmap_disconnect_options(
7385 	ppnum_t pn,
7386 	unsigned int options,
7387 	void *arg)
7388 {
7389 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7390 		/*
7391 		 * On ARM, the "modified" bit is managed by software, so
7392 		 * we know up-front if the physical page is "modified",
7393 		 * without having to scan all the PTEs pointing to it.
7394 		 * The caller should have made the VM page "busy" so noone
7395 		 * should be able to establish any new mapping and "modify"
7396 		 * the page behind us.
7397 		 */
7398 		if (pmap_is_modified(pn)) {
7399 			/*
7400 			 * The page has been modified and will be sent to
7401 			 * the VM compressor.
7402 			 */
7403 			options |= PMAP_OPTIONS_COMPRESSOR;
7404 		} else {
7405 			/*
7406 			 * The page hasn't been modified and will be freed
7407 			 * instead of compressed.
7408 			 */
7409 		}
7410 	}
7411 
7412 	/* disconnect the page */
7413 	pmap_page_protect_options(pn, 0, options, arg);
7414 
7415 	/* return ref/chg status */
7416 	return pmap_get_refmod(pn);
7417 }
7418 
7419 /*
7420  *	Routine:
7421  *		pmap_disconnect
7422  *
7423  *	Function:
7424  *		Disconnect all mappings for this page and return reference and change status
7425  *		in generic format.
7426  *
7427  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7435 
7436 boolean_t
7437 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7438 {
7439 	if (ptoa(first) >= vm_last_phys) {
7440 		return FALSE;
7441 	}
7442 	if (ptoa(last) < vm_first_phys) {
7443 		return FALSE;
7444 	}
7445 
7446 	return TRUE;
7447 }
7448 
7449 /*
7450  * The state maintained by the noencrypt functions is used as a
7451  * debugging aid on ARM.  This incurs some overhead on the part
7452  * of the caller.  A special case check in phys_attribute_clear
7453  * (the most expensive path) currently minimizes this overhead,
7454  * but stubbing these functions out on RELEASE kernels yields
7455  * further wins.
7456  */
7457 boolean_t
7458 pmap_is_noencrypt(
7459 	ppnum_t pn)
7460 {
7461 #if DEVELOPMENT || DEBUG
7462 	boolean_t result = FALSE;
7463 
7464 	if (!pa_valid(ptoa(pn))) {
7465 		return FALSE;
7466 	}
7467 
7468 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7469 
7470 	return result;
7471 #else
7472 #pragma unused(pn)
7473 	return FALSE;
7474 #endif
7475 }
7476 
/* Mark a managed page "noencrypt"; a no-op on RELEASE kernels. */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		/* Unmanaged pages carry no attribute state. */
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7491 
/* Clear the "noencrypt" mark on a managed page; a no-op on RELEASE kernels. */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		/* Unmanaged pages carry no attribute state. */
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7506 
7507 #if XNU_MONITOR
/* Return TRUE if the (managed) page is owned by the PPL monitor. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	/* Caller must pass a managed page; unmanaged pages have no attributes. */
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7514 #endif
7515 
/*
 * Lock the PV-head lock for a managed page; unmanaged pages (and all pages
 * on PPL-enabled builds, where the kernel cannot take PVH locks directly)
 * fall back to a single global backup lock.  Note the `else` intentionally
 * binds across the preprocessor conditional to the trailing block.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7532 
7533 
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV-head
 * lock for managed pages, or the global backup lock otherwise.  The `else`
 * intentionally binds across the preprocessor conditional.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7550 
/*
 * Switch the user translation-table base (TTBR0) to the given pmap on the
 * current CPU, caching per-CPU nested-region state and reprogramming TCR
 * when page sizes differ.  Switching to kernel_pmap clears the user TTB.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested (shared-region) pmap state for fast per-CPU lookup. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the target pmap's page-size config differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the new table base together with the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7579 
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	/* Point the user TTB at the invalid (empty) translation table. */
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7585 
/* Clear the user TTB on this CPU, entering the PPL when required. */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7597 
7598 
7599 #if defined(__arm64__)
7600 /*
7601  * Marker for use in multi-pass fast-fault PV list processing.
7602  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7603  * these functions, as compressed PTEs should never be present in PV lists.
7604  * Note that this only holds true for arm64; for arm32 we don't have enough
7605  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7606  * and WRITEABLE marker depending on whether the PTE is valid.
7607  */
7608 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7609 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7610 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7611 #endif
7612 
7613 
/*
 * Revoke access permissions on every mapping of a physical page so the next
 * disallowed access traps, letting ref/mod state be regathered in software.
 * Also updates reusable/internal ledger accounting when the options request
 * a reusable-state transition.  Returns FALSE if the page is unmanaged or if
 * a wired mapping was skipped (absent PMAP_OPTIONS_FF_WIRED); TRUE otherwise.
 * When flush_range is non-NULL, TLB invalidation for VAs inside the range is
 * deferred to the caller via ptfr_flush_needed.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	/* PMAP_OPTIONS_FF_LOCKED means the caller already holds the PVH lock. */
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to modify; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless the caller opts in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/* Only HW-visible bit changes (not the SW "writeable" bit) need a flush. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					/* Mark this PTE so pass 2 knows to invalidate its TLB entry. */
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need a TLB invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that ref/mod fast-fault state is now armed for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
7920 
7921 MARK_AS_PMAP_TEXT boolean_t
7922 arm_force_fast_fault_internal(
7923 	ppnum_t         ppnum,
7924 	vm_prot_t       allow_mode,
7925 	int             options)
7926 {
7927 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7928 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7929 	}
7930 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
7931 }
7932 
7933 /*
7934  *	Routine:	arm_force_fast_fault
7935  *
7936  *	Function:
7937  *		Force all mappings for this page to fault according
7938  *		to the access modes allowed, so we can gather ref/modify
7939  *		bits again.
7940  */
7941 
7942 boolean_t
7943 arm_force_fast_fault(
7944 	ppnum_t         ppnum,
7945 	vm_prot_t       allow_mode,
7946 	int             options,
7947 	__unused void   *arg)
7948 {
7949 	pmap_paddr_t    phys = ptoa(ppnum);
7950 
7951 	assert(ppnum != vm_page_fictitious_addr);
7952 
7953 	if (!pa_valid(phys)) {
7954 		return FALSE;   /* Not a managed page. */
7955 	}
7956 
7957 #if XNU_MONITOR
7958 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
7959 #else
7960 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
7961 #endif
7962 }
7963 
7964 /*
7965  *	Routine:	arm_clear_fast_fault
7966  *
7967  *	Function:
7968  *		Clear pending force fault for all mappings for this page based on
7969  *		the observed fault type, update ref/modify bits.
7970  */
/*
 * Walk the PV list for a page (or a single supplied PTE), restoring the
 * access/write permissions that were previously revoked to arm a fast
 * fault of the given type, and set the ref/mod attribute bits implied by
 * the fault.  Caller must hold the PVH lock.  Returns TRUE if any PTE was
 * updated (i.e. the fault was "handled").
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no specific PTE was supplied, process the page's whole PV list. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to restore; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore write permission and record the page as referenced+modified. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Restore the access flag and record the page as referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* Only HW-visible bit changes (not the SW "writeable" bit) need a flush. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				/* Mark this PTE so pass 2 knows to invalidate its TLB entry. */
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the walk to limit time spent with the PVH lock held. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need a TLB invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8171 
8172 /*
8173  * Determine if the fault was induced by software tracking of
8174  * modify/reference bits.  If so, re-enable the mapping (and set
8175  * the appropriate bits).
8176  *
8177  * Returns KERN_SUCCESS if the fault was induced and was
8178  * successfully handled.
8179  *
8180  * Returns KERN_FAILURE if the fault was not induced and
8181  * the function was unable to deal with it.
8182  *
8183  * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8184  * disallows this type of access.
8185  *
8186  * Returns KERN_ABORTED if the pmap lock is taken and a
8187  * preemption is pending.
8188  *
8189  */
8190 MARK_AS_PMAP_TEXT kern_return_t
8191 arm_fast_fault_internal(
8192 	pmap_t pmap,
8193 	vm_map_address_t va,
8194 	vm_prot_t fault_type,
8195 	__unused bool was_af_fault,
8196 	__unused bool from_user)
8197 {
8198 	kern_return_t   result = KERN_FAILURE;
8199 	pt_entry_t     *ptep;
8200 	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
8201 	unsigned int    pai;
8202 	pmap_paddr_t    pa;
8203 	validate_pmap_mutable(pmap);
8204 
8205 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
8206 		return KERN_ABORTED;
8207 	}
8208 
8209 	/*
8210 	 * If the entry doesn't exist, is completely invalid, or is already
8211 	 * valid, we can't fix it here.
8212 	 */
8213 
8214 	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8215 	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8216 	if (ptep != PT_ENTRY_NULL) {
8217 		while (true) {
8218 			spte = *((volatile pt_entry_t*)ptep);
8219 
8220 			pa = pte_to_pa(spte);
8221 
8222 			if ((spte == ARM_PTE_TYPE_FAULT) ||
8223 			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
8224 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8225 				return result;
8226 			}
8227 
8228 			if (!pa_valid(pa)) {
8229 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8230 #if XNU_MONITOR
8231 				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
8232 					return KERN_PROTECTION_FAILURE;
8233 				} else
8234 #endif
8235 				return result;
8236 			}
8237 			pai = pa_index(pa);
8238 			pvh_lock(pai);
8239 			if (*ptep == spte) {
8240 				/*
8241 				 * Double-check the spte value, as we care about the AF bit.
8242 				 * It's also possible that pmap_page_protect() transitioned the
8243 				 * PTE to compressed/empty before we grabbed the PVH lock.
8244 				 */
8245 				break;
8246 			}
8247 			pvh_unlock(pai);
8248 		}
8249 	} else {
8250 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
8251 		return result;
8252 	}
8253 
8254 
8255 	if ((result != KERN_SUCCESS) &&
8256 	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
8257 		/*
8258 		 * An attempted access will always clear ref/mod fault state, as
8259 		 * appropriate for the fault type.  arm_clear_fast_fault will
8260 		 * update the associated PTEs for the page as appropriate; if
8261 		 * any PTEs are updated, we redrive the access.  If the mapping
8262 		 * does not actually allow for the attempted access, the
8263 		 * following fault will (hopefully) fail to update any PTEs, and
8264 		 * thus cause arm_fast_fault to decide that it failed to handle
8265 		 * the fault.
8266 		 */
8267 		if (ppattr_test_reffault(pai)) {
8268 			ppattr_clear_reffault(pai);
8269 		}
8270 		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
8271 			ppattr_clear_modfault(pai);
8272 		}
8273 
8274 		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
8275 			/*
8276 			 * Should this preserve KERN_PROTECTION_FAILURE?  The
8277 			 * cost of not doing so is a another fault in a case
8278 			 * that should already result in an exception.
8279 			 */
8280 			result = KERN_SUCCESS;
8281 		}
8282 	}
8283 
8284 	/*
8285 	 * If the PTE already has sufficient permissions, we can report the fault as handled.
8286 	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8287 	 * on mappings of the same page
8288 	 */
8289 	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8290 		uintptr_t ap_ro, ap_rw, ap_x;
8291 		if (pmap == kernel_pmap) {
8292 			ap_ro = ARM_PTE_AP(AP_RONA);
8293 			ap_rw = ARM_PTE_AP(AP_RWNA);
8294 			ap_x = ARM_PTE_NX;
8295 		} else {
8296 			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8297 			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8298 			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8299 		}
8300 		/*
8301 		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8302 		 * hardware they may be xPRR-protected, in which case they'll be handled
8303 		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
8304 		 * handling path currently does not call arm_fast_fault() without at least
8305 		 * VM_PROT_READ in fault_type.
8306 		 */
8307 		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8308 		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8309 			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8310 				result = KERN_SUCCESS;
8311 			}
8312 		}
8313 	}
8314 
8315 	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
8316 		/*
8317 		 * A prior arm_clear_fast_fault() operation may have returned early due to
8318 		 * another pending PV list operation or an excessively large PV list.
8319 		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8320 		 * taking a fault on the same mapping.
8321 		 */
8322 		result = KERN_SUCCESS;
8323 	}
8324 
8325 	pvh_unlock(pai);
8326 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
8327 	return result;
8328 }
8329 
8330 kern_return_t
8331 arm_fast_fault(
8332 	pmap_t pmap,
8333 	vm_map_address_t va,
8334 	vm_prot_t fault_type,
8335 	bool was_af_fault,
8336 	__unused bool from_user)
8337 {
8338 	kern_return_t   result = KERN_FAILURE;
8339 
8340 	if (va < pmap->min || va >= pmap->max) {
8341 		return result;
8342 	}
8343 
8344 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8345 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8346 	    from_user);
8347 
8348 	do {
8349 #if XNU_MONITOR
8350 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8351 #else
8352 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8353 #endif
8354 	} while (result == KERN_ABORTED);
8355 
8356 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8357 
8358 	return result;
8359 }
8360 
8361 void
8362 pmap_copy_page(
8363 	ppnum_t psrc,
8364 	ppnum_t pdst)
8365 {
8366 	bcopy_phys((addr64_t) (ptoa(psrc)),
8367 	    (addr64_t) (ptoa(pdst)),
8368 	    PAGE_SIZE);
8369 }
8370 
8371 
8372 /*
8373  *	pmap_copy_page copies the specified (machine independent) pages.
8374  */
8375 void
8376 pmap_copy_part_page(
8377 	ppnum_t psrc,
8378 	vm_offset_t src_offset,
8379 	ppnum_t pdst,
8380 	vm_offset_t dst_offset,
8381 	vm_size_t len)
8382 {
8383 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8384 	    (addr64_t) (ptoa(pdst) + dst_offset),
8385 	    len);
8386 }
8387 
8388 
8389 /*
8390  *	pmap_zero_page zeros the specified (machine independent) page.
8391  */
8392 void
8393 pmap_zero_page(
8394 	ppnum_t pn)
8395 {
8396 	assert(pn != vm_page_fictitious_addr);
8397 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8398 }
8399 
8400 /*
8401  *	pmap_zero_part_page
8402  *	zeros the specified (machine independent) part of a page.
8403  */
8404 void
8405 pmap_zero_part_page(
8406 	ppnum_t pn,
8407 	vm_offset_t offset,
8408 	vm_size_t len)
8409 {
8410 	assert(pn != vm_page_fictitious_addr);
8411 	assert(offset + len <= PAGE_SIZE);
8412 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8413 }
8414 
/*
 * Install the kernel mapping for LOWGLOBAL_ALIAS, a fixed-address alias of
 * the lowGlo structure.  The target PTE must be currently empty; the new
 * mapping is kernel read-only (AP_RONA) and never executable.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Kernel read-only, non-executable at all ELs, access flag pre-set. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Make the PTE visible before invalidating any stale TLB entries. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Tell KASan the alias page is now a valid kernel address. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8439 
8440 vm_offset_t
8441 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8442 {
8443 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8444 		panic("%s: invalid index %u", __func__, index);
8445 	}
8446 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8447 }
8448 
/*
 * Map physical page 'pn' into a free per-CPU copy window on the current CPU,
 * with the requested protection and cacheability (wimg) attributes, and
 * return the window index used.  Panics if all windows are in use.
 *
 * NOTE(review): this reads per-CPU state via pmap_get_cpu_data(), so callers
 * presumably must prevent migration to another CPU for the lifetime of the
 * window — confirm against callers.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* With the PPL enabled, copy windows may only target non-managed (I/O) pages. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Scan this CPU's windows for one whose PTE is currently empty. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build the PTE: valid, access flag set, never executable. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}

	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the strong-sync flag recorded by the window's PREVIOUS occupant for this flush. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8521 
/*
 * Map the given physical page into a free per-CPU copy window and return
 * the window index (usable with pmap_cpu_windows_copy_addr()).  Dispatches
 * to the PPL on monitor-enabled systems.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8534 
/*
 * Tear down the mapping in per-CPU copy window 'index' on the current CPU
 * and invalidate the TLB for it.  Counterpart of
 * pmap_map_cpu_windows_copy_internal().
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE, then flush using the strong-sync flag recorded at map time. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8555 
/*
 * Unmap per-CPU copy window 'index' on the current CPU.  Dispatches to the
 * PPL on monitor-enabled systems.
 *
 * Note: the callee is void, so it is called as a plain statement rather
 * than via `return expr;` — returning an expression (even a void one) from
 * a void function violates ISO C (C11 6.8.6.4) and was inconsistent with
 * other void dispatch wrappers in this file (e.g. pmap_set_nested()).
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8566 
8567 #if XNU_MONITOR
8568 
8569 MARK_AS_PMAP_TEXT void
8570 pmap_invoke_with_page(
8571 	ppnum_t page_number,
8572 	void *ctx,
8573 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8574 {
8575 	#pragma unused(page_number, ctx, callback)
8576 }
8577 
8578 /*
8579  * Loop over every pmap_io_range (I/O ranges marked as owned by
8580  * the PPL in the device tree) and conditionally call callback() on each range
8581  * that needs to be included in the hibernation image.
8582  *
8583  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8584  *                 context is needed in the callback.
8585  * @param callback Callback function invoked on each range (gated by flag).
8586  */
8587 MARK_AS_PMAP_TEXT void
8588 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8589 {
8590 	extern const pmap_io_range_t* io_attr_table;
8591 	extern const unsigned int num_io_rgns;
8592 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8593 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8594 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8595 		}
8596 	}
8597 }
8598 
8599 /**
8600  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8601  * PPL-owned page. Otherwise, do nothing.
8602  *
8603  * @param addr Physical address of the page to set the HASHED flag on.
8604  */
8605 MARK_AS_PMAP_TEXT void
8606 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8607 {
8608 	/* Ignore non-managed kernel memory. */
8609 	if (!pa_valid(addr)) {
8610 		return;
8611 	}
8612 
8613 	const unsigned int pai = pa_index(addr);
8614 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8615 		pv_entry_t **pv_h = pai_to_pvh(pai);
8616 
8617 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8618 		pvh_lock(pai);
8619 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8620 		pvh_unlock(pai);
8621 	}
8622 }
8623 
8624 /**
8625  * Loop through every physical page in the system and clear out the HASHED flag
8626  * on every PPL-owned page. That flag is used to keep track of which pages have
8627  * been hashed into the hibernation image during the hibernation entry process.
8628  *
8629  * The HASHED flag needs to be cleared out between hibernation cycles because the
8630  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8631  * image with the HASHED flag set on certain pages. It's important to clear the
8632  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8633  * into the hibernation image can't be compromised across hibernation cycles.
8634  */
8635 MARK_AS_PMAP_TEXT void
8636 pmap_clear_ppl_hashed_flag_all(void)
8637 {
8638 	const unsigned int last_index = pa_index(vm_last_phys);
8639 	pv_entry_t **pv_h = NULL;
8640 
8641 	for (int pai = 0; pai < last_index; ++pai) {
8642 		pv_h = pai_to_pvh(pai);
8643 
8644 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8645 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8646 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8647 			pvh_lock(pai);
8648 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8649 			pvh_unlock(pai);
8650 		}
8651 	}
8652 }
8653 
8654 /**
8655  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8656  * ppl_hib driver will call this after all wired pages have been copied into the
8657  * hibernation image.
8658  */
8659 MARK_AS_PMAP_TEXT void
8660 pmap_check_ppl_hashed_flag_all(void)
8661 {
8662 	const unsigned int last_index = pa_index(vm_last_phys);
8663 	pv_entry_t **pv_h = NULL;
8664 
8665 	for (int pai = 0; pai < last_index; ++pai) {
8666 		pv_h = pai_to_pvh(pai);
8667 
8668 		/**
8669 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8670 		 * the pages that contain the PMAP stacks.
8671 		 */
8672 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8673 		    (pai < pa_index(pmap_stacks_end_pa));
8674 
8675 		if (!is_pmap_stack &&
8676 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8677 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8678 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8679 		}
8680 	}
8681 }
8682 
8683 #endif /* XNU_MONITOR */
8684 
8685 /*
8686  * Indicate that a pmap is intended to be used as a nested pmap
8687  * within one or more larger address spaces.  This must be set
8688  * before pmap_nest() is called with this pmap as the 'subordinate'.
8689  */
8690 MARK_AS_PMAP_TEXT void
8691 pmap_set_nested_internal(
8692 	pmap_t pmap)
8693 {
8694 	validate_pmap_mutable(pmap);
8695 	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8696 		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8697 		    __func__, pmap, pmap->type);
8698 	}
8699 	pmap->type = PMAP_TYPE_NESTED;
8700 	pmap_get_pt_ops(pmap)->free_id(pmap);
8701 }
8702 
/*
 * Mark 'pmap' as a nested (shared-region) pmap.  Dispatches to the PPL on
 * monitor-enabled systems.  Must be called before pmap_nest() uses this
 * pmap as the subordinate.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8713 
8714 /*
8715  * pmap_trim_range(pmap, start, end)
8716  *
8717  * pmap  = pmap to operate on
8718  * start = start of the range
8719  * end   = end of the range
8720  *
8721  * Attempts to deallocate TTEs for the given range in the nested range.
8722  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The requested range must lie entirely within the pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to deallocate at this twig; release the lock ourselves. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table; it may only be freed if every entry is invalid. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
8838 
8839 /**
8840  * State machine for multi-step pmap trimming. Trimming is the action of
8841  * deallocating the TTEs of the shared region of pmaps down to a given range.
8842  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
8843  * disabling preemption for too long. These steps include computing the bounds
8844  * of the shared region, trimming the head of the "grand", trimming the tail of
8845  * the "grand", and trimming the "subord". Some of the steps can be skipped under
8846  * different conditions.
8847  *
8848  * @param grand the pmap in which the pages are nested
8849  * @param subord the pmap from which the pages are shared, or nested
8850  * @param vstart start of the used range in "grand"
8851  * @param size size of the used range
8852  * @param state the current state of the state machine
8853  *
8854  * @return the next state of the state machine, to be used in the next call
8855  *         into this function.
8856  */
8857 MARK_AS_PMAP_TEXT pmap_trim_state_t
8858 pmap_trim_internal(
8859 	pmap_t grand,
8860 	pmap_t subord,
8861 	addr64_t vstart,
8862 	uint64_t size,
8863 	pmap_trim_state_t state)
8864 {
8865 	/* Validation needs to be done regardless of state. */
8866 	addr64_t vend;
8867 
8868 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
8869 		panic("%s: grand addr wraps around, "
8870 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8871 		    __func__, grand, subord, (void*)vstart, size, state);
8872 	}
8873 
8874 	validate_pmap_mutable(grand);
8875 	validate_pmap(subord);
8876 
8877 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8878 		panic("%s: subord is of non-nestable type 0x%hhx, "
8879 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8880 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
8881 	}
8882 
8883 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
8884 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
8885 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8886 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
8887 	}
8888 
8889 	if (__improbable(grand->nested_pmap != subord)) {
8890 		panic("%s: grand->nested != subord, "
8891 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8892 		    __func__, grand, subord, (void*)vstart, size, state);
8893 	}
8894 
8895 	if (__improbable((size != 0) &&
8896 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
8897 		panic("%s: grand range not in nested region, "
8898 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8899 		    __func__, grand, subord, (void*)vstart, size, state);
8900 	}
8901 
8902 	/* Trimming starts with figuring out the bounds for the grand. */
8903 	if (state == PMAP_TRIM_STATE_START) {
8904 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8905 
8906 		/**
8907 		 * The "nested_has_no_bounds_ref" flag is set by `pmap_nest()` if the subord is nested into
8908 		 * the grand when the bounds are not known yet. Therefore, if it is not set, either any nesting
8909 		 * has not happened, or trimming has been done, or nesting has been done with bounds known so
8910 		 * the "extra" region was not nested in the first place. Anyway, trimming is not needed so
8911 		 * we exit early with PMAP_TRIM_STATE_DONE.
8912 		 */
8913 		if (!grand->nested_has_no_bounds_ref) {
8914 			assert(subord->nested_bounds_set);
8915 
8916 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
8917 			if (!grand->nested_bounds_set) {
8918 				/* Inherit the bounds from subord. */
8919 				grand->nested_region_true_start = subord->nested_region_true_start;
8920 				grand->nested_region_true_end = subord->nested_region_true_end;
8921 				grand->nested_bounds_set = true;
8922 			}
8923 
8924 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8925 
8926 			/* Now that the grand has bounds, we are done. */
8927 			return PMAP_TRIM_STATE_DONE;
8928 		}
8929 
8930 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
8931 		if ((!subord->nested_bounds_set) && size) {
8932 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8933 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8934 
8935 			subord->nested_region_true_start = vstart;
8936 			subord->nested_region_true_end = vend;
8937 			subord->nested_region_true_start &= ~adjust_offmask;
8938 
8939 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
8940 				panic("%s: padded true end wraps around, "
8941 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8942 				    __func__, grand, subord, (void*)vstart, size, state);
8943 			}
8944 
8945 			subord->nested_region_true_end &= ~adjust_offmask;
8946 			subord->nested_bounds_set = true;
8947 		}
8948 
8949 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
8950 		if (subord->nested_bounds_set) {
8951 			/* Inherit the bounds from subord. */
8952 			grand->nested_region_true_start = subord->nested_region_true_start;
8953 			grand->nested_region_true_end = subord->nested_region_true_end;
8954 			grand->nested_bounds_set = true;
8955 
8956 			/* If we know the bounds, we can trim the pmap. */
8957 			grand->nested_has_no_bounds_ref = false;
8958 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8959 
8960 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
8961 		} else {
8962 			/* Don't trim if we don't know the bounds. */
8963 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8964 
8965 			return PMAP_TRIM_STATE_DONE;
8966 		}
8967 	}
8968 
8969 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
8970 	if (!grand->nested_bounds_set) {
8971 		panic("%s: !grand->nested_bounds_set, "
8972 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8973 		    __func__, grand, subord, (void*)vstart, size, state);
8974 	}
8975 
8976 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
8977 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
8978 
8979 #if XNU_MONITOR
8980 		if (pmap_pending_preemption()) {
8981 			return PMAP_TRIM_STATE_GRAND_AFTER;
8982 		}
8983 #endif
8984 
8985 		state = PMAP_TRIM_STATE_GRAND_AFTER;
8986 	}
8987 
8988 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
8989 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
8990 
8991 #if XNU_MONITOR
8992 		if (pmap_pending_preemption()) {
8993 			return PMAP_TRIM_STATE_SUBORD;
8994 		}
8995 #endif
8996 
8997 		state = PMAP_TRIM_STATE_SUBORD;
8998 	}
8999 
9000 	/* START state is guaranteed to compute the bounds for the subord. */
9001 	if (!subord->nested_bounds_set) {
9002 		panic("%s: !subord->nested_bounds_set, "
9003 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9004 		    __func__, grand, subord, (void*)vstart, size, state);
9005 	}
9006 
9007 	if (state == PMAP_TRIM_STATE_SUBORD) {
9008 		pmap_trim_subord(subord);
9009 	}
9010 
9011 	return PMAP_TRIM_STATE_DONE;
9012 }
9013 
/*
 * Drop this pmap's "no bounds" reference on its nested pmap, trim this
 * pmap's nested-region page tables outside the true bounds (if known), and
 * then give the nested pmap a chance to trim itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_has_no_bounds_ref = false;
		/* Snapshot the bounds under the lock so they are used consistently below. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the head and tail of the nested region outside the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9037 
9038 /*
9039  * pmap_trim_subord(grand, subord)
9040  *
9041  * grand  = pmap that we have nested subord in
9042  * subord = nested pmap we are attempting to trim
9043  *
9044  * Trims subord if possible
9045  */
9046 MARK_AS_PMAP_TEXT static void
9047 pmap_trim_subord(pmap_t subord)
9048 {
9049 	bool contract_subord = false;
9050 
9051 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9052 
9053 	subord->nested_no_bounds_refcnt--;
9054 
9055 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9056 		/* If this was the last no bounds reference, trim subord. */
9057 		contract_subord = true;
9058 	}
9059 
9060 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9061 
9062 	if (contract_subord) {
9063 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9064 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9065 	}
9066 }
9067 
9068 /**
9069  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9070  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9071  * disabling preemption for too long.
9072  *
9073  * @note When we load the shared region we always create pages tables for the
9074  *       entire region. In practice, the shared cache may use just a portion
9075  *       of that. Before we know the bounds of the shared region, it can
9076  *       already be mapped into processes. Therefore, once the bounds are
9077  *       known, "trimming" comes in handy to remove the unnecessary page
9078  *       tables in the processes the shared region is mapped in, and eventually
9079  *       those in the shared region itself. Note that the shared region must
9080  *       be trimmed after the user processes because it has the L3 entries
9081  *       everyone else is pointing to.
9082  *
9083  * @param grand the pmap in which the pages are nested
9084  * @param subord the pmap from which the pages are shared, or nested
9085  * @param vstart start of the used range in "grand"
9086  * @param size size of the used range
9087  */
void
pmap_trim(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
	/* On PPL systems, drives the state machine until its done. */
	while (state != PMAP_TRIM_STATE_DONE) {
		__assert_only pmap_trim_state_t old_state = state;
		state = pmap_trim_ppl(grand, subord, vstart, size, state);

		/* Are we making progress? */
		assert(old_state != state);
	}

	/* Trimming may move mappings between ledgers; verify they still balance. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	state = pmap_trim_internal(grand, subord, vstart, size, state);

	/* On non-PPL systems, we expect the implementation to finish in one call. */
	assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9116 
9117 #if HAS_APPLE_PAC
/*
 * Sign a user-mode pointer with a process-independent PAC key (IA or DA),
 * temporarily switching to the target process's user JOP key.  Interrupts
 * are disabled across the key swap so signing cannot be preempted while a
 * non-kernel JOP key is live.  Panics on any other key.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Prevent the compiler from hoisting the sign out of the key-swap window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: the key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9149 
9150 void *
9151 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9152 {
9153 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9154 }
9155 
/**
 * Authenticate a signed user pointer using the user process's JOP key.
 *
 * @param value the signed pointer to authenticate
 * @param key the ptrauth key the pointer was signed with; only the
 *        process-independent keys (IA/DA) are accepted — any other key panics
 * @param discriminator ptrauth discriminator originally mixed into the signature
 * @param jop_key the user process's JOP key, temporarily installed for the auth
 *
 * @return the result of ml_auth_ptr_unchecked() on the value
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent instruction/data keys may be used here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Disable interrupts so nothing else can run while the user JOP key is
	 * installed in place of the kernel's.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/*
	 * Compiler barriers keep the auth operation strictly between the key
	 * install and its restoration.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9176 
9177 void *
9178 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9179 {
9180 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9181 }
9182 #endif /* HAS_APPLE_PAC */
9183 
9184 /*
9185  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9186  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9187  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9188  * return value, to indicate where a preempted [un]nest operation should resume.
9189  * When the return value contains the ending address of the nested region with
9190  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9191  */
9192 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9193 
9194 /*
9195  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9196  *
9197  *	grand  = the pmap that we will nest subord into
9198  *	subord = the pmap that goes into the grand
9199  *	vstart  = start of range in pmap to be inserted
9200  *	size   = Size of nest area (up to 16TB)
9201  *
9202  *	Inserts a pmap into another.  This is used to implement shared segments.
9203  *
9204  */
9205 
9206 /**
9207  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9208  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9209  * This function operates in 3 main phases:
9210  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9211  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9212  *    the mapping range are present in subord.
9213  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9214  *    contains pointers to subord's leaf-level pagetable pages for the specified
9215  *    VA range.
9216  *
9217  * This function may return early due to pending AST_URGENT preemption; if so
9218  * it will indicate the need to be re-entered.
9219  *
9220  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9221  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9222  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9223  * @param size twig-aligned size of the nesting range
9224  * @param vrestart the twig-aligned starting address of the current call.  May contain
9225  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9226  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9227  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9228  *
9229  * @return the virtual address at which to restart the operation, possibly including
9230  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9231  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9232  */
9233 MARK_AS_PMAP_TEXT vm_map_offset_t
9234 pmap_nest_internal(
9235 	pmap_t grand,
9236 	pmap_t subord,
9237 	addr64_t vstart,
9238 	uint64_t size,
9239 	vm_map_offset_t vrestart,
9240 	kern_return_t *krp)
9241 {
9242 	kern_return_t kr = KERN_FAILURE;
9243 	vm_map_offset_t vaddr;
9244 	tt_entry_t     *stte_p;
9245 	tt_entry_t     *gtte_p;
9246 	unsigned int    nested_region_asid_bitmap_size;
9247 	unsigned int*   nested_region_asid_bitmap;
9248 	int             expand_options = 0;
9249 	bool            deref_subord = true;
9250 
9251 	addr64_t vend;
9252 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9253 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9254 	}
9255 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9256 	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9257 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9258 		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9259 	}
9260 
9261 	assert(krp != NULL);
9262 	validate_pmap_mutable(grand);
9263 	validate_pmap(subord);
9264 #if XNU_MONITOR
9265 	/*
9266 	 * Ordering is important here.  validate_pmap() has already ensured subord is a
9267 	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9268 	 * be in the process of being destroyed.  If destruction is already committed,
9269 	 * then the check of ref_count below will cover us.  If destruction is initiated
9270 	 * during or after this call, then pmap_destroy() will catch the non-zero
9271 	 * nested_count.
9272 	 */
9273 	os_atomic_inc(&subord->nested_count, relaxed);
9274 	os_atomic_thread_fence(seq_cst);
9275 #endif
9276 	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9277 		panic("%s: invalid subordinate pmap %p", __func__, subord);
9278 	}
9279 
9280 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9281 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9282 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9283 	}
9284 
9285 #if XNU_MONITOR
9286 	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9287 #endif
9288 
9289 	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9290 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9291 		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9292 		    grand, vstart, size, (unsigned long long)vrestart);
9293 	}
9294 
9295 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9296 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9297 	}
9298 
9299 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9300 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9301 	}
9302 
9303 	if (subord->nested_region_asid_bitmap == NULL) {
9304 		nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
9305 
9306 #if XNU_MONITOR
9307 		pmap_paddr_t pa = 0;
9308 
9309 		if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9310 			panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
9311 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9312 			    __FUNCTION__, nested_region_asid_bitmap_size,
9313 			    grand, subord, vstart, size);
9314 		}
9315 
9316 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9317 
9318 		if (kr != KERN_SUCCESS) {
9319 			goto nest_cleanup;
9320 		}
9321 
9322 		assert(pa);
9323 
9324 		nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9325 #else
9326 		nested_region_asid_bitmap = kalloc_data(
9327 			nested_region_asid_bitmap_size * sizeof(unsigned int),
9328 			Z_WAITOK | Z_ZERO);
9329 #endif
9330 
9331 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9332 		if (subord->nested_region_asid_bitmap == NULL) {
9333 			subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
9334 			subord->nested_region_addr = vstart;
9335 			subord->nested_region_size = (mach_vm_offset_t) size;
9336 
9337 			/**
9338 			 * Ensure that the rest of the subord->nested_region_* fields are
9339 			 * initialized and visible before setting the nested_region_asid_bitmap
9340 			 * field (which is used as the flag to say that the rest are initialized).
9341 			 */
9342 			__builtin_arm_dmb(DMB_ISHST);
9343 			subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
9344 			nested_region_asid_bitmap = NULL;
9345 		}
9346 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9347 		if (nested_region_asid_bitmap != NULL) {
9348 #if XNU_MONITOR
9349 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9350 #else
9351 			kfree_data(nested_region_asid_bitmap,
9352 			    nested_region_asid_bitmap_size * sizeof(unsigned int));
9353 #endif
9354 		}
9355 	}
9356 
9357 	/**
9358 	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9359 	 * speculated before their initialization.
9360 	 */
9361 	__builtin_arm_dmb(DMB_ISHLD);
9362 
9363 	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9364 		uint64_t        new_size;
9365 		unsigned int    new_nested_region_asid_bitmap_size;
9366 		unsigned int*   new_nested_region_asid_bitmap;
9367 
9368 		nested_region_asid_bitmap = NULL;
9369 		nested_region_asid_bitmap_size = 0;
9370 		new_size =  vend - subord->nested_region_addr;
9371 
9372 		/* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
9373 		new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
9374 
9375 #if XNU_MONITOR
9376 		pmap_paddr_t pa = 0;
9377 
9378 		if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9379 			panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
9380 			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9381 			    __FUNCTION__, new_nested_region_asid_bitmap_size,
9382 			    grand, subord, vstart, new_size);
9383 		}
9384 
9385 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9386 
9387 		if (kr != KERN_SUCCESS) {
9388 			goto nest_cleanup;
9389 		}
9390 
9391 		assert(pa);
9392 
9393 		new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9394 #else
9395 		new_nested_region_asid_bitmap = kalloc_data(
9396 			new_nested_region_asid_bitmap_size * sizeof(unsigned int),
9397 			Z_WAITOK | Z_ZERO);
9398 #endif
9399 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9400 		if (subord->nested_region_size < new_size) {
9401 			bcopy(subord->nested_region_asid_bitmap,
9402 			    new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
9403 			nested_region_asid_bitmap_size  = subord->nested_region_asid_bitmap_size;
9404 			nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
9405 			subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
9406 			subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
9407 			subord->nested_region_size = new_size;
9408 			new_nested_region_asid_bitmap = NULL;
9409 		}
9410 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9411 		if (nested_region_asid_bitmap != NULL) {
9412 #if XNU_MONITOR
9413 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9414 #else
9415 			kfree_data(nested_region_asid_bitmap,
9416 			    nested_region_asid_bitmap_size * sizeof(unsigned int));
9417 #endif
9418 		}
9419 		if (new_nested_region_asid_bitmap != NULL) {
9420 #if XNU_MONITOR
9421 			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9422 #else
9423 			kfree_data(new_nested_region_asid_bitmap,
9424 			    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9425 #endif
9426 		}
9427 	}
9428 
9429 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9430 
9431 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9432 		/*
9433 		 * If this is grand's first nesting operation, keep the reference on subord.
9434 		 * It will be released by pmap_destroy_internal() when grand is destroyed.
9435 		 */
9436 		deref_subord = false;
9437 
9438 		if (!subord->nested_bounds_set) {
9439 			/*
9440 			 * We are nesting without the shared regions bounds
9441 			 * being known.  We'll have to trim the pmap later.
9442 			 */
9443 			grand->nested_has_no_bounds_ref = true;
9444 			subord->nested_no_bounds_refcnt++;
9445 		}
9446 
9447 		grand->nested_region_addr = vstart;
9448 		grand->nested_region_size = (mach_vm_offset_t) size;
9449 	} else {
9450 		if (__improbable(grand->nested_pmap != subord)) {
9451 			panic("pmap_nest() pmap %p has a nested pmap", grand);
9452 		} else if (__improbable(grand->nested_region_addr > vstart)) {
9453 			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9454 		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9455 			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9456 		}
9457 	}
9458 
9459 	vaddr = vrestart & ~PMAP_NEST_GRAND;
9460 	if (vaddr < subord->nested_region_true_start) {
9461 		vaddr = subord->nested_region_true_start;
9462 	}
9463 
9464 	addr64_t true_end = vend;
9465 	if (true_end > subord->nested_region_true_end) {
9466 		true_end = subord->nested_region_true_end;
9467 	}
9468 	__unused unsigned int ttecount = 0;
9469 
9470 	if (vrestart & PMAP_NEST_GRAND) {
9471 		goto nest_grand;
9472 	}
9473 
9474 	while (vaddr < true_end) {
9475 		stte_p = pmap_tte(subord, vaddr);
9476 		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9477 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9478 			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9479 
9480 			if (kr != KERN_SUCCESS) {
9481 				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9482 				goto done;
9483 			}
9484 
9485 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9486 		}
9487 		vaddr += pt_attr_twig_size(pt_attr);
9488 		vrestart = vaddr;
9489 		++ttecount;
9490 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9491 		    pmap_pending_preemption())) {
9492 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9493 			kr = KERN_SUCCESS;
9494 			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9495 			goto done;
9496 		}
9497 	}
9498 	/*
9499 	 * copy TTEs from subord pmap into grand pmap
9500 	 */
9501 
9502 	vaddr = (vm_map_offset_t) vstart;
9503 	if (vaddr < subord->nested_region_true_start) {
9504 		vaddr = subord->nested_region_true_start;
9505 	}
9506 	vrestart = vaddr | PMAP_NEST_GRAND;
9507 
9508 nest_grand:
9509 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9510 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9511 	while (vaddr < true_end) {
9512 		stte_p = pmap_tte(subord, vaddr);
9513 		gtte_p = pmap_tte(grand, vaddr);
9514 		if (gtte_p == PT_ENTRY_NULL) {
9515 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9516 			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9517 			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9518 
9519 			if (kr != KERN_SUCCESS) {
9520 				goto done;
9521 			}
9522 
9523 			gtte_p = pmap_tt2e(grand, vaddr);
9524 		}
9525 		/* Don't leak a page table page.  Don't violate break-before-make. */
9526 		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9527 			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9528 			    __func__, gtte_p, grand);
9529 		}
9530 		*gtte_p = *stte_p;
9531 
9532 		vaddr += pt_attr_twig_size(pt_attr);
9533 		vrestart = vaddr | PMAP_NEST_GRAND;
9534 		++ttecount;
9535 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9536 		    pmap_pending_preemption())) {
9537 			break;
9538 		}
9539 	}
9540 	if (vaddr >= true_end) {
9541 		vrestart = vend | PMAP_NEST_GRAND;
9542 	}
9543 
9544 	kr = KERN_SUCCESS;
9545 done:
9546 
9547 	FLUSH_PTE();
9548 	__builtin_arm_isb(ISB_SY);
9549 
9550 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9551 #if XNU_MONITOR
9552 nest_cleanup:
9553 	if (kr != KERN_SUCCESS) {
9554 		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9555 		*krp = kr;
9556 		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9557 	}
9558 #else
9559 	if (kr != KERN_SUCCESS) {
9560 		*krp = kr;
9561 	}
9562 #endif
9563 	if (deref_subord) {
9564 #if XNU_MONITOR
9565 		os_atomic_dec(&subord->nested_count, relaxed);
9566 #endif
9567 		pmap_destroy_internal(subord);
9568 	}
9569 	return vrestart;
9570 }
9571 
/*
 * Nest 'subord' into 'grand' over [vstart, vstart + size) by repeatedly
 * invoking the preemptible implementation until the returned restart cursor
 * equals (vend | PMAP_NEST_GRAND), which signals completion.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL could not allocate; donate a page and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/* Reset kr to KERN_SUCCESS and try again. */
			kr = KERN_SUCCESS;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call that didn't advance the cursor means we're stuck. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9624 
9625 /*
9626  *	kern_return_t pmap_unnest(grand, vaddr)
9627  *
9628  *	grand  = the pmap that will have the virtual range unnested
9629  *	vaddr  = start of range in pmap to be unnested
9630  *	size   = size of range in pmap to be unnested
9631  *
9632  */
9633 
9634 kern_return_t
9635 pmap_unnest(
9636 	pmap_t grand,
9637 	addr64_t vaddr,
9638 	uint64_t size)
9639 {
9640 	return pmap_unnest_options(grand, vaddr, size, 0);
9641 }
9642 
9643 /**
9644  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9645  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9646  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9647  * still have the region nested.  The mappings in 'grand' will be left empty
9648  * with the assumption that they will be demand-filled by subsequent access faults.
9649  *
9650  * This function operates in 2 main phases:
9651  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9652  *    them non-global.
9653  * 2. Clearing of the twig-level TTEs for the address range in grand.
9654  *
9655  * This function may return early due to pending AST_URGENT preemption; if so
9656  * it will indicate the need to be re-entered.
9657  *
9658  * @param grand pmap from which to unnest mappings
9659  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9660  * @param size twig-aligned size of the nested range
9661  * @param vrestart the page-aligned starting address of the current call.  May contain
9662  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9663  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9664  *        grand is being torn down and step 1) above is not needed.
9665  *
9666  * @return the virtual address at which to restart the operation, possibly including
9667  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9668  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9669  */
9670 MARK_AS_PMAP_TEXT vm_map_offset_t
9671 pmap_unnest_options_internal(
9672 	pmap_t grand,
9673 	addr64_t vaddr,
9674 	uint64_t size,
9675 	vm_map_offset_t vrestart,
9676 	unsigned int option)
9677 {
9678 	vm_map_offset_t start;
9679 	vm_map_offset_t addr;
9680 	tt_entry_t     *tte_p;
9681 	unsigned int    current_index;
9682 	unsigned int    start_index;
9683 	unsigned int    max_index;
9684 	unsigned int    entry_count = 0;
9685 
9686 	addr64_t vend;
9687 	addr64_t true_end;
9688 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9689 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9690 	}
9691 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9692 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9693 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9694 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9695 	}
9696 
9697 	validate_pmap_mutable(grand);
9698 
9699 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9700 
9701 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9702 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9703 		    (unsigned long long)vaddr, (unsigned long long)size);
9704 	}
9705 
9706 	if (__improbable(grand->nested_pmap == NULL)) {
9707 		panic("%s: %p has no nested pmap", __func__, grand);
9708 	}
9709 
9710 	true_end = vend;
9711 	if (true_end > grand->nested_pmap->nested_region_true_end) {
9712 		true_end = grand->nested_pmap->nested_region_true_end;
9713 	}
9714 
9715 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
9716 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9717 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9718 		}
9719 
9720 		pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9721 
9722 		start = vrestart;
9723 		if (start < grand->nested_pmap->nested_region_true_start) {
9724 			start = grand->nested_pmap->nested_region_true_start;
9725 		}
9726 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9727 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9728 		bool flush_tlb = false;
9729 
9730 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9731 			pt_entry_t  *bpte, *cpte;
9732 
9733 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9734 
9735 			bpte = pmap_pte(grand->nested_pmap, addr);
9736 
9737 			/*
9738 			 * If we've re-entered this function partway through unnesting a leaf region, the
9739 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
9740 			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
9741 			 * address.
9742 			 */
9743 			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
9744 			    (addr & pt_attr_twig_offmask(pt_attr))) {
9745 				/*
9746 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
9747 				 * the nested pmap in this region will now be marked non-global.  Do this
9748 				 * before marking any of the PTEs within the region as non-global to avoid
9749 				 * the possibility of pmap_enter() subsequently inserting a global mapping
9750 				 * in the region, which could lead to a TLB conflict if a non-global entry
9751 				 * is later inserted for the same VA in a pmap which has fully unnested this
9752 				 * region.
9753 				 */
9754 				setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
9755 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
9756 					pmap_paddr_t    pa;
9757 					unsigned int    pai = 0;
9758 					boolean_t               managed = FALSE;
9759 					pt_entry_t  spte;
9760 
9761 					if ((*cpte != ARM_PTE_TYPE_FAULT)
9762 					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
9763 						spte = *((volatile pt_entry_t*)cpte);
9764 						while (!managed) {
9765 							pa = pte_to_pa(spte);
9766 							if (!pa_valid(pa)) {
9767 								break;
9768 							}
9769 							pai = pa_index(pa);
9770 							pvh_lock(pai);
9771 							spte = *((volatile pt_entry_t*)cpte);
9772 							pa = pte_to_pa(spte);
9773 							if (pai == pa_index(pa)) {
9774 								managed = TRUE;
9775 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
9776 							}
9777 							pvh_unlock(pai);
9778 						}
9779 
9780 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
9781 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
9782 							flush_tlb = true;
9783 						}
9784 
9785 						if (managed) {
9786 							pvh_assert_locked(pai);
9787 							pvh_unlock(pai);
9788 						}
9789 					}
9790 
9791 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9792 					vrestart = addr;
9793 					++entry_count;
9794 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9795 					    pmap_pending_preemption())) {
9796 						goto unnest_subord_done;
9797 					}
9798 				}
9799 			}
9800 			addr = vlim;
9801 			vrestart = addr;
9802 			++entry_count;
9803 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9804 			    pmap_pending_preemption())) {
9805 				break;
9806 			}
9807 		}
9808 
9809 unnest_subord_done:
9810 		if (flush_tlb) {
9811 			FLUSH_PTE_STRONG();
9812 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
9813 		}
9814 
9815 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9816 		if (current_index < max_index) {
9817 			return vrestart;
9818 		}
9819 	}
9820 
9821 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9822 
9823 	/*
9824 	 * invalidate all pdes for segment at vaddr in pmap grand
9825 	 */
9826 	if (vrestart & PMAP_NEST_GRAND) {
9827 		addr = vrestart & ~PMAP_NEST_GRAND;
9828 		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
9829 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
9830 		}
9831 	} else {
9832 		addr = vaddr;
9833 		vrestart = vaddr | PMAP_NEST_GRAND;
9834 	}
9835 
9836 	if (addr < grand->nested_pmap->nested_region_true_start) {
9837 		addr = grand->nested_pmap->nested_region_true_start;
9838 	}
9839 
9840 	while (addr < true_end) {
9841 		tte_p = pmap_tte(grand, addr);
9842 		/*
9843 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
9844 		 * so it's possible that a region we're trying to unnest may not have been
9845 		 * nested in the first place.
9846 		 */
9847 		if (tte_p != NULL) {
9848 			*tte_p = ARM_TTE_TYPE_FAULT;
9849 		}
9850 		addr += pt_attr_twig_size(pt_attr);
9851 		vrestart = addr | PMAP_NEST_GRAND;
9852 		++entry_count;
9853 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9854 		    pmap_pending_preemption())) {
9855 			break;
9856 		}
9857 	}
9858 	if (addr >= true_end) {
9859 		vrestart = vend | PMAP_NEST_GRAND;
9860 	}
9861 
9862 	FLUSH_PTE_STRONG();
9863 	PMAP_UPDATE_TLBS(grand, start, addr, false, false);
9864 
9865 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9866 
9867 	return vrestart;
9868 }
9869 
9870 kern_return_t
9871 pmap_unnest_options(
9872 	pmap_t grand,
9873 	addr64_t vaddr,
9874 	uint64_t size,
9875 	unsigned int option)
9876 {
9877 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
9878 	vm_map_offset_t vend = vaddr + size;
9879 	__unused vm_map_offset_t vlast = vrestart;
9880 
9881 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9882 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9883 
9884 	pmap_verify_preemptible();
9885 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
9886 #if XNU_MONITOR
9887 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
9888 		if (vrestart == vlast) {
9889 			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9890 			    __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
9891 		}
9892 		vlast = vrestart;
9893 #else
9894 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
9895 #endif
9896 	}
9897 
9898 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9899 
9900 	return KERN_SUCCESS;
9901 }
9902 
/*
 * Hook allowing the pmap layer to adjust the [start, end) bounds of an unnest
 * request.  The ARM pmap performs no adjustment; returning TRUE simply lets
 * the VM layer proceed (and reach log_unnest_badness()).
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
9911 
9912 #if PMAP_FORK_NEST
9913 /**
9914  * Perform any necessary pre-nesting of the parent's shared region at fork()
9915  * time.
9916  *
9917  * @note This should only be called from vm_map_fork().
9918  *
9919  * @param old_pmap The pmap of the parent task.
9920  * @param new_pmap The pmap of the child task.
9921  * @param nesting_start An output parameter that is updated with the start
9922  *                      address of the range that was pre-nested
9923  * @param nesting_end An output parameter that is updated with the end
9924  *                      address of the range that was pre-nested
9925  *
9926  * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
9927  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
9928  */
9929 kern_return_t
9930 pmap_fork_nest(
9931 	pmap_t old_pmap,
9932 	pmap_t new_pmap,
9933 	vm_map_offset_t *nesting_start,
9934 	vm_map_offset_t *nesting_end)
9935 {
9936 	if (old_pmap == NULL || new_pmap == NULL) {
9937 		return KERN_INVALID_ARGUMENT;
9938 	}
9939 	if (old_pmap->nested_pmap == NULL) {
9940 		return KERN_SUCCESS;
9941 	}
9942 	pmap_nest(new_pmap,
9943 	    old_pmap->nested_pmap,
9944 	    old_pmap->nested_region_addr,
9945 	    old_pmap->nested_region_size);
9946 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
9947 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
9948 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
9949 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
9950 	    new_pmap->nested_pmap,
9951 	    new_pmap->nested_region_addr,
9952 	    new_pmap->nested_region_size,
9953 	    old_pmap->nested_pmap,
9954 	    old_pmap->nested_region_addr,
9955 	    old_pmap->nested_region_size);
9956 	*nesting_start = old_pmap->nested_region_addr;
9957 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
9958 	return KERN_SUCCESS;
9959 }
9960 #endif /* PMAP_FORK_NEST */
9961 
9962 /*
9963  * disable no-execute capability on
9964  * the specified pmap
9965  */
9966 #if DEVELOPMENT || DEBUG
9967 void
9968 pmap_disable_NX(
9969 	pmap_t pmap)
9970 {
9971 	pmap->nx_enabled = FALSE;
9972 }
9973 #else
9974 void
9975 pmap_disable_NX(
9976 	__unused pmap_t pmap)
9977 {
9978 }
9979 #endif
9980 
9981 /*
9982  * flush a range of hardware TLB entries.
9983  * NOTE: assumes the smallest TLB entry in use will be for
9984  * an ARM small page (4K).
9985  */
9986 
9987 #define ARM_FULL_TLB_FLUSH_THRESHOLD 64
9988 
9989 #if __ARM_RANGE_TLBI__
9990 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9991 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_PAGES
9992 #else
9993 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
9994 #endif // __ARM_RANGE_TLBI__
9995 
/**
 * Issue (but do not synchronize) TLB invalidations covering a virtual range
 * in the given pmap.  Depending on the number of pages, this selects a full
 * flush, a flush of the pmap's entire ASID, a hardware range invalidate
 * (when available), or per-entry invalidates.
 *
 * @param va start of the virtual range to invalidate.
 * @param length length of the range in bytes.
 * @param pmap the pmap whose mappings are being invalidated.
 * @param last_level_only if true, restrict invalidation to last-level (leaf)
 *        entries where the invalidation primitive supports the distinction.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
	uint32_t    asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * An ASID of 0 or a nested pmap cannot be covered by a
		 * single-ASID invalidate here, so flush the whole TLB instead.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Hardware range invalidate: pack count/ASID/VA into a single parameter. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only);
		}
		return;
	}
#endif
	/* Per-entry invalidation: fold the ASID into the start/end operands. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
	}
}
10043 
/**
 * Issue (without synchronizing) a TLB invalidation covering every entry
 * tagged with the pmap's hardware ASID.
 */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
}
10049 
/**
 * Flush TLB entries for a kernel virtual range and wait for the
 * invalidation to complete.
 *
 * @param va start of the kernel virtual range.
 * @param length length of the range in bytes.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
	sync_tlb_flush();
}
10058 
10059 unsigned int
10060 pmap_cache_attributes(
10061 	ppnum_t pn)
10062 {
10063 	pmap_paddr_t    paddr;
10064 	unsigned int    pai;
10065 	unsigned int    result;
10066 	pp_attr_t       pp_attr_current;
10067 
10068 	paddr = ptoa(pn);
10069 
10070 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10071 
10072 	if (!pa_valid(paddr)) {
10073 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10074 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10075 	}
10076 
10077 	result = VM_WIMG_DEFAULT;
10078 
10079 	pai = pa_index(paddr);
10080 
10081 	pp_attr_current = pp_attr_table[pai];
10082 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10083 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10084 	}
10085 	return result;
10086 }
10087 
/**
 * Perform any cache maintenance needed after a page's WIMG (cacheability)
 * attributes have changed, so stale cached data cannot linger under the
 * new attributes.
 *
 * @param pn the page whose attributes changed.
 * @param wimg_bits_prev the previous VM_WIMG_* attribute.
 * @param wimg_bits_new the new VM_WIMG_* attribute.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Sync the page to the point of coherency when leaving a cacheable mode.
	 *
	 * NOTE(review): the final clause `(wimg_bits_new != VM_WIMG_COPYBACK) ||
	 * (wimg_bits_new != VM_WIMG_INNERWBACK)` is a tautology (a value cannot
	 * equal both constants), so any transition away from VM_WIMG_WTHRU
	 * triggers the sync.  If the intent was to exempt transitions into the
	 * write-back modes, `||` should be `&&` — confirm before changing, since
	 * the current form is merely conservative (extra syncs, never missing).
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* A transition into real-time (RT) mode forces a data-cache clean. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10104 
/**
 * Change a managed page's cache attributes on behalf of the VM compressor,
 * then perform whatever cache maintenance the transition requires.
 *
 * @param pn the page to update; must be a managed (pmap-tracked) page.
 * @param prev_cacheattr the page's current VM_WIMG_* attribute.
 * @param new_cacheattr the VM_WIMG_* attribute to install.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages may not have their attributes changed from here. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* Rewrite the page's mappings (and flush TLBs) under the PVH lock. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance happens after the mapping change is in effect. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10129 
/**
 * Return a kernel virtual address the compressor can use to access a page.
 *
 * On configurations with a physical aperture, a page currently mapped with
 * non-default cache attributes is first switched to VM_WIMG_DEFAULT so the
 * compressor's accesses through the aperture are cacheable.
 *
 * @param pn the page to map.
 *
 * @return the page's address in the physical aperture.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10145 
/**
 * Undo pmap_map_compressor_page(): if the page's recorded attributes differ
 * from VM_WIMG_DEFAULT, switch its mappings back from the temporary default
 * attribute to the recorded one.  (The pp_attr_table entry itself was not
 * changed by the map operation, so pmap_cache_attributes() still reports
 * the original attribute here.)
 *
 * @param pn the page that was mapped.
 * @param kva the address returned by pmap_map_compressor_page() (unused).
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10160 
10161 /**
10162  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10163  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10164  *
10165  * @param user_page_list List of pages to be updated.
10166  * @param page_cnt Number of pages in total in user_page_list.
10167  * @param cacheattr The new cache attribute.
10168  *
10169  * @return Success if true is returned.
10170  */
10171 bool
10172 pmap_batch_set_cache_attributes(
10173 	upl_page_info_array_t user_page_list,
10174 	unsigned int page_cnt,
10175 	unsigned int cacheattr)
10176 {
10177 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10178 
10179 	if (page_cnt == 0) {
10180 		return true;
10181 	}
10182 
10183 	batch_set_cache_attr_state_t states;
10184 	states.page_index = 0;
10185 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10186 	states.tlb_flush_pass_needed = false;
10187 	states.rt_cache_flush_pass_needed = false;
10188 
10189 	/* Verify we are being called from a preemptible context. */
10190 	pmap_verify_preemptible();
10191 
10192 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10193 #if XNU_MONITOR
10194 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10195 #else /* !XNU_MONITOR */
10196 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10197 #endif /* XNU_MONITOR */
10198 	}
10199 
10200 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10201 	return true;
10202 }
10203 
10204 /**
10205  * Flushes TLB entries associated with the page specified by paddr, but do not
10206  * issue barriers yet.
10207  *
10208  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10209  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	pvh_assert_locked(pai);

	/* The PV head either points directly at a single PTE or at a PVE list. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page and queue a TLB invalidate for it. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next one. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU TLB entries; nothing to flush here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* After exhausting a PVE's slots, move to the next PVE in the list. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10262 
10263 /**
10264  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10265  *
10266  * @param pai The Physical Address Index of the entry.
10267  * @param cacheattr The new cache attribute.
10268  */
10269 MARK_AS_PMAP_TEXT static void
10270 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10271 {
10272 	pvh_assert_locked(pai);
10273 
10274 	pp_attr_t pp_attr_current, pp_attr_template;
10275 	do {
10276 		pp_attr_current = pp_attr_table[pai];
10277 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10278 
10279 		/**
10280 		 * WIMG bits should only be updated under the PVH lock, but we should do
10281 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10282 		 */
10283 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10284 }
10285 
10286 /**
10287  * Batch updates the cache attributes of a list of pages in three passes.
10288  *
10289  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10290  * In pass two, TLB entries are flushed for each page in the list if necessary.
10291  * In pass three, caches are cleaned for each page in the list if necessary.
10292  *
10293  * When running in PPL, this function may decide to return to the caller in response
10294  * to AST_URGENT.
10295  *
10296  * @param user_page_list List of pages to be updated.
10297  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10298  * @param page_cnt Number of pages in total in user_page_list.
10299  * @param cacheattr The new cache attributes.
10300  *
10301  * @return The new state of the state machine.
10302  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the state-machine snapshot handed in by the caller. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* An entry with no WIMG bits set is treated as VM_WIMG_DEFAULT. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Any page transitioning into RT forces the pass-3 cache clean. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* Wait for all of the invalidations queued above to complete. */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Must disable DC-by-VA ops before bailing out mid-pass. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Re-pack the (possibly partial) progress for the caller to resume from. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10528 
/**
 * Common implementation for setting a page's cache attributes: records the
 * new WIMG bits in the pp_attr_table and, if the effective bits changed,
 * rewrites every mapping of the page (flushing TLBs immediately) and
 * performs the required cache maintenance.
 *
 * @param pn the page to update; non-managed pages are silently ignored.
 * @param cacheattr the requested VM_WIMG_* attribute; VM_WIMG_USE_DEFAULT
 *        is translated to VM_WIMG_DEFAULT.
 * @param external whether the request originated outside the PPL (only used
 *        for the ownership checks on XNU_MONITOR builds).
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The page's PPL ownership must match the origin of the request. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An entry with no WIMG bits set is treated as VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Rewrite existing mappings only if the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance happens after the mapping change is in effect. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10591 
/**
 * Internal/PPL-side entry point for pmap_set_cache_attributes().  Passes
 * external=TRUE so that (on XNU_MONITOR builds) PPL-owned pages are rejected.
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10599 
/**
 * Set the cache attributes of a single page.  Dispatches to the PPL on
 * PPL-enabled platforms, or to the internal helper otherwise.
 *
 * @param pn the page to update.
 * @param cacheattr the new VM_WIMG_* cache attribute.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10611 
10612 /**
10613  * Updates the page numbered ppnum to have attribute specified by attributes.
10614  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10615  * The necessity of the TLB flush is returned in case this function is called
10616  * in a batched manner and the TLB flush is intended to be done at a different
10617  * timing.
10618  *
10619  * @param ppnum Page Number of the page to be updated.
10620  * @param attributes The new cache attributes.
10621  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10622  *        immediately.
10623  *
10624  * @return Returns true if a TLB flush is needed for this update regardless of
10625  *         whether a flush has occurred already.
10626  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally refuse device-type WIMG attributes on managed pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First, rewrite the page's mapping in the kernel's physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	/* Only the attribute-index and shareability fields are replaced. */
	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
	} else {
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	/* Then walk the PV list and rewrite every other mapping of the page. */
	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	/* The PV head either points directly at a single PTE or at a PVE list. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next one. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not rewritten here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
		va = ptep_get_va(pte_p);

		/* Replace only the attribute-index and shareability fields. */
		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	/* Synchronize only when we both flushed here and issued invalidates. */
	if (perform_tlbi && tlb_flush_needed) {
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
10749 
10750 /**
10751  * Mark a pmap as being dedicated to use for a commpage mapping.
10752  * The pmap itself will never be activated on a CPU; its mappings will
10753  * only be embedded in userspace pmaps at a fixed virtual address.
10754  *
10755  * @param pmap the pmap to mark as belonging to a commpage.
10756  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be configured before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
10776 
10777 static void
10778 pmap_update_tt3e(
10779 	pmap_t pmap,
10780 	vm_address_t address,
10781 	tt_entry_t template)
10782 {
10783 	tt_entry_t *ptep, pte;
10784 
10785 	ptep = pmap_tt3e(pmap, address);
10786 	if (ptep == NULL) {
10787 		panic("%s: no ptep?", __FUNCTION__);
10788 	}
10789 
10790 	pte = *ptep;
10791 	pte = tte_to_pa(pte) | template;
10792 	write_pte_strong(ptep, pte);
10793 }
10794 
10795 /* Note absence of non-global bit */
10796 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10797 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10798 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
10799 	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10800 
10801 /* Note absence of non-global bit and no-execute bit.  */
10802 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10803 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10804 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
10805 	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10806 
10807 void
10808 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10809     vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10810 {
10811 	kern_return_t kr;
10812 	pmap_paddr_t data_pa = 0; // data address
10813 	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10814 	pmap_paddr_t text_pa = 0; // text address
10815 
10816 	*kernel_data_addr = 0;
10817 	*kernel_text_addr = 0;
10818 	*user_text_addr = 0;
10819 
10820 #if XNU_MONITOR
10821 	data_pa = pmap_alloc_page_for_kern(0);
10822 	assert(data_pa);
10823 	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
10824 	ro_data_pa = pmap_alloc_page_for_kern(0);
10825 	assert(ro_data_pa);
10826 	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
10827 #if CONFIG_ARM_PFZ
10828 	text_pa = pmap_alloc_page_for_kern(0);
10829 	assert(text_pa);
10830 	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
10831 #endif
10832 
10833 #else /* XNU_MONITOR */
10834 	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
10835 	/*
10836 	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
10837 	 * mapped at page granularity, so a separate page for kernel RO data would not
10838 	 * be useful.
10839 	 */
10840 	ro_data_pa = data_pa;
10841 #if CONFIG_ARM_PFZ
10842 	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
10843 #endif
10844 
10845 #endif /* XNU_MONITOR */
10846 
10847 	/*
10848 	 * In order to avoid burning extra pages on mapping the shared page, we
10849 	 * create a dedicated pmap for the shared page.  We forcibly nest the
10850 	 * translation tables from this pmap into other pmaps.  The level we
10851 	 * will nest at depends on the MMU configuration (page size, TTBR range,
10852 	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
10853 	 *
10854 	 * Note that this is NOT "the nested pmap" (which is used to nest the
10855 	 * shared cache).
10856 	 *
10857 	 * Note that we update parameters of the entry for our unique needs (NG
10858 	 * entry, etc.).
10859 	 */
10860 	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
10861 	assert(commpage_pmap_default != NULL);
10862 	pmap_set_commpage(commpage_pmap_default);
10863 
10864 	/* The user 64-bit mappings... */
10865 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10866 	assert(kr == KERN_SUCCESS);
10867 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10868 
10869 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10870 	assert(kr == KERN_SUCCESS);
10871 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10872 #if CONFIG_ARM_PFZ
10873 	/* User mapping of comm page text section for 64 bit mapping only
10874 	 *
10875 	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10876 	 * user processes to get this page mapped in, they should never call into
10877 	 * this page.
10878 	 *
10879 	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10880 	 * is slid in the same L3 as the data commpage.  It is either outside the
10881 	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
10882 	 * it is reserved and unavailable to mach VM for future mappings.
10883 	 */
10884 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
10885 	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
10886 
10887 	vm_map_address_t commpage_text_va = 0;
10888 
10889 	do {
10890 		int text_leaf_index = random() % num_ptes;
10891 
10892 		// Generate a VA for the commpage text with the same root and twig index as data
10893 		// comm page, but with new leaf index we've just generated.
10894 		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
10895 		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
10896 	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10897 
10898 	// Assert that this is empty
10899 	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
10900 	assert(ptep != PT_ENTRY_NULL);
10901 	assert(*ptep == ARM_TTE_EMPTY);
10902 
10903 	// At this point, we've found the address we want to insert our comm page at
10904 	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10905 	assert(kr == KERN_SUCCESS);
10906 	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
10907 	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
10908 
10909 	*user_text_addr = commpage_text_va;
10910 #endif
10911 
10912 	/* ...and the user 32-bit mappings. */
10913 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10914 	assert(kr == KERN_SUCCESS);
10915 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10916 
10917 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10918 	assert(kr == KERN_SUCCESS);
10919 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10920 #if __ARM_MIXED_PAGE_SIZE__
10921 	/**
10922 	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
10923 	 * new set of page tables that point to the exact same 16K shared page as
10924 	 * before. Only the first 4K of the 16K shared page is mapped since that's
10925 	 * the only part that contains relevant data.
10926 	 */
10927 	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
10928 	assert(commpage_pmap_4k != NULL);
10929 	pmap_set_commpage(commpage_pmap_4k);
10930 
10931 	/* The user 64-bit mappings... */
10932 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10933 	assert(kr == KERN_SUCCESS);
10934 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10935 
10936 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10937 	assert(kr == KERN_SUCCESS);
10938 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10939 
10940 	/* ...and the user 32-bit mapping. */
10941 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10942 	assert(kr == KERN_SUCCESS);
10943 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10944 
10945 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10946 	assert(kr == KERN_SUCCESS);
10947 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10948 #endif
10949 
10950 	/* For manipulation in kernel, go straight to physical page */
10951 	*kernel_data_addr = phystokv(data_pa);
10952 	assert(commpage_ro_data_kva == 0);
10953 	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
10954 	assert(commpage_text_kva == 0);
10955 	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
10956 }
10957 
10958 
10959 /*
10960  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10961  * with user controlled TTEs for regions that aren't explicitly reserved by the
10962  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10963  */
10964 #if (ARM_PGSHIFT == 14)
10965 /**
10966  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10967  * commpage completely above the maximum 32-bit userspace VA.
10968  */
10969 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10970 
10971 /**
10972  * Normally there'd be an assert to check that 64-bit devices with 64-bit
10973  * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userspace VA, but that technically isn't true on macOS. On those systems, the
10975  * commpage lives within the userspace VA range, but is protected by the VM as
10976  * a reserved region (see vm_reserved_regions[] definition for more info).
10977  */
10978 
10979 #elif (ARM_PGSHIFT == 12)
10980 /**
10981  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10982  * above the maximum userspace VA.
10983  */
10984 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10985 #else
10986 #error Nested shared page mapping is unsupported on this config
10987 #endif
10988 
/**
 * Insert ("nest") the preallocated commpage page tables into a user pmap so
 * that the task shares the global commpage mappings instead of allocating its
 * own leaf tables for them.
 *
 * @note MARK_AS_PMAP_TEXT: on XNU_MONITOR configurations this runs inside the
 *       PPL and therefore must not block on page allocation (hence
 *       PMAP_OPTIONS_NOWAIT below).
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success. KERN_RESOURCE_SHORTAGE (XNU_MONITOR only)
 *         or KERN_ABORTED if pmap_expand() could not complete; the caller is
 *         expected to retry in those cases. Any other expansion failure
 *         panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* The PPL cannot wait for memory; callers retry on KERN_RESOURCE_SHORTAGE. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage lives at a different fixed VA for 64-bit vs 32-bit tasks. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock while expanding: pmap_expand() may need to allocate. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		/* Re-acquire and re-check: the table may have appeared concurrently. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the commpage pmap's twig entry into this pmap, sharing its leaf table. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11091 
/**
 * Remove the nested commpage twig entry from a user pmap, undoing
 * pmap_insert_commpage_internal().
 *
 * @note Only clears the twig-level entry that points at the shared leaf
 *       table; the shared table itself (owned by the commpage pmap) is left
 *       untouched.
 *
 * @param pmap The user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage lives at a different fixed VA for 64-bit vs 32-bit tasks. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never allocated for this pmap. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any cached translations for the now-removed mapping. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false);
	sync_tlb_flush();
}
11148 
/**
 * Map the commpage into a user pmap, retrying until the insertion succeeds.
 *
 * @note Panics if the commpage ultimately cannot be inserted for a reason
 *       other than the retryable ones handled below.
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/*
	 * The PPL cannot allocate pages itself: feed it a page on each
	 * KERN_RESOURCE_SHORTAGE and retry.  KERN_ABORTED is also retried.
	 */
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* KERN_ABORTED indicates a retryable failure from pmap_expand(). */
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11177 
11178 static boolean_t
11179 pmap_is_64bit(
11180 	pmap_t pmap)
11181 {
11182 	return pmap->is_64bit;
11183 }
11184 
/* No pmap configuration handled by this file is considered "exotic". */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11191 
11192 
11193 /* ARMTODO -- an implementation that accounts for
11194  * holes in the physical map, if any.
11195  */
11196 boolean_t
11197 pmap_valid_page(
11198 	ppnum_t pn)
11199 {
11200 	return pa_valid(ptoa(pn));
11201 }
11202 
11203 boolean_t
11204 pmap_bootloader_page(
11205 	ppnum_t pn)
11206 {
11207 	pmap_paddr_t paddr = ptoa(pn);
11208 
11209 	if (pa_valid(paddr)) {
11210 		return FALSE;
11211 	}
11212 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11213 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11214 }
11215 
/**
 * Check whether a VA range in a pmap contains no valid leaf mappings.
 *
 * @note Takes the pmap lock shared for user pmaps, but only when not running
 *       in the kernel debugger context (not_in_kdp), where locking would be
 *       unsafe.
 *
 * @param pmap The pmap to scan (NULL is treated as trivially empty).
 * @param va_start Start of the virtual address range (inclusive).
 * @param va_end End of the virtual address range (exclusive).
 *
 * @return TRUE if no PTE in the range is valid, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample once so lock and unlock decisions agree even if not_in_kdp changes. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-table (leaf page table) span at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a live mapping: unlock (if locked) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11276 
/**
 * Public wrapper for pmap_is_empty_internal(); on XNU_MONITOR configurations
 * the check is performed inside the PPL.
 *
 * @param pmap The pmap to scan.
 * @param va_start Start of the virtual address range (inclusive).
 * @param va_end End of the virtual address range (exclusive).
 *
 * @return TRUE if no valid mapping exists in the range, FALSE otherwise.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11289 
11290 vm_map_offset_t
11291 pmap_max_offset(
11292 	boolean_t               is64,
11293 	unsigned int    option)
11294 {
11295 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11296 }
11297 
11298 vm_map_offset_t
11299 pmap_max_64bit_offset(
11300 	__unused unsigned int option)
11301 {
11302 	vm_map_offset_t max_offset_ret = 0;
11303 
11304 #if defined(__arm64__)
11305 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11306 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11307 		max_offset_ret = arm64_pmap_max_offset_default;
11308 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11309 		max_offset_ret = min_max_offset;
11310 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11311 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11312 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11313 		if (arm64_pmap_max_offset_default) {
11314 			max_offset_ret = arm64_pmap_max_offset_default;
11315 		} else if (max_mem > 0xC0000000) {
11316 			// devices with > 3GB of memory
11317 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11318 		} else if (max_mem > 0x40000000) {
11319 			// devices with > 1GB and <= 3GB of memory
11320 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11321 		} else {
11322 			// devices with <= 1 GB of memory
11323 			max_offset_ret = min_max_offset;
11324 		}
11325 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11326 		if (arm64_pmap_max_offset_default) {
11327 			// Allow the boot-arg to override jumbo size
11328 			max_offset_ret = arm64_pmap_max_offset_default;
11329 		} else {
11330 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11331 		}
11332 	} else {
11333 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11334 	}
11335 
11336 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11337 	assert(max_offset_ret >= min_max_offset);
11338 #else
11339 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11340 #endif
11341 
11342 	return max_offset_ret;
11343 }
11344 
11345 vm_map_offset_t
11346 pmap_max_32bit_offset(
11347 	unsigned int option)
11348 {
11349 	vm_map_offset_t max_offset_ret = 0;
11350 
11351 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11352 		max_offset_ret = arm_pmap_max_offset_default;
11353 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11354 		max_offset_ret = VM_MAX_ADDRESS;
11355 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11356 		max_offset_ret = VM_MAX_ADDRESS;
11357 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11358 		if (arm_pmap_max_offset_default) {
11359 			max_offset_ret = arm_pmap_max_offset_default;
11360 		} else if (max_mem > 0x20000000) {
11361 			max_offset_ret = VM_MAX_ADDRESS;
11362 		} else {
11363 			max_offset_ret = VM_MAX_ADDRESS;
11364 		}
11365 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11366 		max_offset_ret = VM_MAX_ADDRESS;
11367 	} else {
11368 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11369 	}
11370 
11371 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11372 	return max_offset_ret;
11373 }
11374 
11375 #if CONFIG_DTRACE
11376 /*
11377  * Constrain DTrace copyin/copyout actions
11378  */
11379 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11380 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11381 
11382 kern_return_t
11383 dtrace_copyio_preflight(
11384 	__unused addr64_t va)
11385 {
11386 	if (current_map() == kernel_map) {
11387 		return KERN_FAILURE;
11388 	} else {
11389 		return KERN_SUCCESS;
11390 	}
11391 }
11392 
/* No post-copy cleanup is required on ARM; always succeeds. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11399 #endif /* CONFIG_DTRACE */
11400 
11401 
/* Deferred-flush contexts carry no state on ARM; initialization is a no-op. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11406 
11407 
11408 void
11409 pmap_flush(
11410 	__unused pmap_flush_context *cpus_to_flush)
11411 {
11412 	/* not implemented yet */
11413 	return;
11414 }
11415 
11416 #if XNU_MONITOR
11417 
11418 /*
11419  * Enforce that the address range described by kva and nbytes is not currently
11420  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11421  * unintentionally writing to PPL-owned memory.
11422  */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Pin each page backing [kva, end). */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Writes must go through a dedicated mapping, not the physical aperture. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * Atomically set NO_MONITOR on the page, refusing pages that are
		 * already PPL-owned; retry if another CPU races an attribute update.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11445 
/**
 * Release the pin placed by pmap_pin_kernel_pages() on the pages backing
 * [kva, kva + nbytes).  Panics if any page in the range is not pinned.
 *
 * @param kva Starting kernel virtual address of the range.
 * @param nbytes Length of the range in bytes.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page can never have become PPL-owned in the meantime. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11463 
11464 /**
11465  * Lock down a page, making all mappings read-only, and preventing further
11466  * mappings or removal of this particular kva's mapping. Effectively, it makes
11467  * the physical page at kva immutable (see the ppl_writable parameter for an
11468  * exception to this).
11469  *
11470  * @param kva Valid address to any mapping of the physical page to lockdown.
11471  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11472  * @param ppl_writable True if the PPL should still be able to write to the page
11473  *                     using the physical aperture mapping. False will make the
11474  *                     page read-only for both the kernel and PPL in the
11475  *                     physical aperture.
11476  */
11477 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Lock down with the common case of read-only alias mappings. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11483 
11484 /**
11485  * Lock down a page, giving all mappings the specified maximum permissions, and
11486  * preventing further mappings or removal of this particular kva's mapping.
11487  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11488  * parameter for an exception to this).
11489  *
11490  * @param kva Valid address to any mapping of the physical page to lockdown.
11491  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11492  * @param ppl_writable True if the PPL should still be able to write to the page
11493  *                     using the physical aperture mapping. False will make the
11494  *                     page read-only for both the kernel and PPL in the
11495  *                     physical aperture.
11496  * @param prot Maximum permissions to allow in existing alias mappings
11497  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The flag must be one of the recognized lockdown reasons. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot be locked down on behalf of the kernel. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Refuse double-lockdown and executable pages. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote all existing alias mappings of this page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11545 
11546 /**
11547  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11548  * kernel once again.
11549  *
11550  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11551  *       to unlockdown a page that was never locked down, will panic.
11552  *
11553  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11554  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11555  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11556  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11557  *                     deviation will result in a panic.
11558  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlockdown must be paired with a prior lockdown of the same reason. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11579 
11580 /**
11581  * Release a page from being locked down to the PPL, making it writable to the
11582  * kernel once again.
11583  *
11584  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11585  *       to unlockdown a page that was never locked down, will panic.
11586  *
11587  * @param kva Valid address to any mapping of the physical page to unlockdown.
11588  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11589  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11590  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11591  *                     deviation will result in a panic.
11592  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The flag must be one of the recognized lockdown reasons. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PV head lock and delegate to the locked variant. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11604 
11605 #else /* XNU_MONITOR */
11606 
/* No-op: pinning only has meaning on PPL-enabled (XNU_MONITOR) systems. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11611 
/* No-op: unpinning only has meaning on PPL-enabled (XNU_MONITOR) systems. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11616 
11617 #endif /* !XNU_MONITOR */
11618 
11619 
/**
 * Lock down pages on behalf of code signing, tagging them with the CS
 * lockdown reason on PPL-enabled systems (no reason flag exists otherwise).
 *
 * @param kva Kernel VA of the first page to lock down.
 * @param size Length of the range in bytes.
 * @param ppl_writable True if the PPL should retain write access via the
 *                     physical aperture.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11629 
/**
 * Undo pmap_cs_lockdown_pages() for the given range, using the matching
 * CS lockdown reason on PPL-enabled systems.
 *
 * @param kva Kernel VA of the first page to release.
 * @param size Length of the range in bytes.
 * @param ppl_writable Must match the value passed at lockdown time.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11639 
11640 /**
11641  * Perform basic validation checks on the destination only and
11642  * corresponding offset/sizes prior to writing to a read only allocation.
11643  *
11644  * @note Should be called before writing to an allocation from the read
11645  * only allocator.
11646  *
11647  * @param zid The ID of the zone the allocation belongs to.
11648  * @param va VA of element being modified (destination).
11649  * @param offset Offset being written to, in the element.
11650  * @param new_data_size Size of modification.
11651  *
11652  */
11653 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	/* Only zones in the read-only ID range may be written through this path. */
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/* The write must fit entirely inside the element starting at offset. */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
11680 
11681 
11682 /**
11683  * Perform basic validation checks on the source, destination and
11684  * corresponding offset/sizes prior to writing to a read only allocation.
11685  *
11686  * @note Should be called before writing to an allocation from the read
11687  * only allocator.
11688  *
11689  * @param zid The ID of the zone the allocation belongs to.
11690  * @param va VA of element being modified (destination).
11691  * @param offset Offset being written to, in the element.
11692  * @param new_data Pointer to new data (source).
11693  * @param new_data_size Size of modification.
11694  *
11695  */
11696 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	vm_offset_t sum = 0;

	/* Reject source ranges that wrap the address space. */
	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
		panic("%s: Integer addition overflow %p + %lu = %lu",
		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
	}

	/* Destination checks (zone range, alignment, bounds) are delegated. */
	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
}
11714 
11715 /**
11716  * Ensure that physical page is locked down and pinned, before writing to it.
11717  *
11718  * @note Should be called before writing to an allocation from the read
11719  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
11720  * ensure that it is called after the modification.
11721  *
11722  *
11723  * @param pa Physical address of the element being modified.
11724  * @param va Virtual address of element being modified.
11725  * @param size Size of the modification.
11726  *
11727  */
11728 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	const unsigned int pai = pa_index(pa);
	/* Held until pmap_ro_zone_unlock_phy_page() releases it. */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
11749 
11750 /**
11751  * Unlock and unpin physical page after writing to it.
11752  *
11753  * @note Should be called after writing to an allocation from the read
11754  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
11755  * ensure that it has been called prior to the modification.
11756  *
11757  * @param pa Physical address of the element that was modified.
11758  * @param va Virtual address of element that was modified.
11759  * @param size Size of the modification.
11760  *
11761  */
11762 
11763 MARK_AS_PMAP_TEXT static void
11764 pmap_ro_zone_unlock_phy_page(
11765 	const pmap_paddr_t  pa,
11766 	vm_offset_t         va,
11767 	vm_size_t           size)
11768 {
11769 	const unsigned int pai = pa_index(pa);
11770 	pmap_unpin_kernel_pages(va, size);
11771 	pvh_unlock(pai);
11772 }
11773 
11774 /**
11775  * Function to copy kauth_cred from new_data to kv.
11776  * Function defined in "kern_prot.c"
11777  *
11778  * @note Will be removed upon completion of
11779  * <rdar://problem/72635194> Compiler PAC support for memcpy.
11780  *
11781  * @param kv Address to copy new data to.
11782  * @param new_data Pointer to new data.
11783  *
11784  */
11785 
11786 extern void
11787 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11788 
11789 /**
11790  * Zalloc-specific memcpy that writes through the physical aperture
11791  * and ensures the element being modified is from a read-only zone.
11792  *
11793  * @note Designed to work only with the zone allocator's read-only submap.
11794  *
11795  * @param zid The ID of the zone to allocate from.
11796  * @param va VA of element to be modified.
11797  * @param offset Offset from element.
11798  * @param new_data Pointer to new data.
11799  * @param new_data_size	Size of modification.
11800  *
11801  */
11802 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/*
	 * On PPL-enabled systems the write must be done by the PPL, which owns
	 * the read-only submap's mappings; otherwise call the implementation
	 * directly.
	 */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
11817 
11818 MARK_AS_PMAP_TEXT void
11819 pmap_ro_zone_memcpy_internal(
11820 	zone_id_t             zid,
11821 	vm_offset_t           va,
11822 	vm_offset_t           offset,
11823 	const vm_offset_t     new_data,
11824 	vm_size_t             new_data_size)
11825 {
11826 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11827 
11828 	if (!new_data || new_data_size == 0) {
11829 		return;
11830 	}
11831 
11832 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11833 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11834 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11835 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11836 }
11837 
11838 /**
11839  * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
11841  *
11842  * @note Designed to work only with the zone allocator's read-only submap.
11843  *
11844  * @param zid The ID of the zone the element belongs to.
11845  * @param va VA of element to be modified.
11846  * @param offset Offset in element.
11847  * @param op Atomic operation to perform.
11848  * @param value	Mutation value.
11849  *
11850  */
11851 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* Route through the PPL when present; it owns writes to RO zone memory. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
11866 
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low nibble of the op is used as the access size in bytes -- assumed
	 * part of the zro_atomic_op_t encoding; confirm against its definition. */
	vm_size_t value_size = op & 0xf;

	/* Validate the destination, then perform the atomic through the
	 * physical aperture with the page pinned and PV-locked. */
	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	/* Returns the value produced by __zalloc_ro_mut_atomic(). */
	return value;
}
11885 
11886 /**
11887  * bzero for allocations from read only zones, that writes through the
11888  * physical aperture.
11889  *
11890  * @note This is called by the zfree path of all allocations from read
11891  * only zones.
11892  *
11893  * @param zid The ID of the zone the allocation belongs to.
11894  * @param va VA of element to be zeroed.
11895  * @param offset Offset in the element.
11896  * @param size	Size of allocation.
11897  *
11898  */
11899 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* Route through the PPL when present; it owns writes to RO zone memory. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
11913 
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Source of 0 indicates a zero-fill to the validation routine. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	/* Zero through the physical aperture so the RO mapping is never written. */
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
11927 
11928 /**
11929  * Removes write access from the Physical Aperture.
11930  *
11931  * @note For non-PPL devices, it simply makes all virtual mappings RO.
11932  * @note Designed to work only with the zone allocator's read-only submap.
11933  *
 * @param va VA of the page to remove write access from.
11935  *
11936  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down read-only so future writable mappings are refused. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: downgrade every existing mapping of the backing page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
11946 
11947 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
11948 
/**
 * Count resident and compressor-backed bytes in a range of a pmap.
 *
 * The range must be page-aligned and must not span more than one twig (TTE)
 * worth of address space; callers iterate twig-by-twig.
 *
 * Returns PMAP_RESIDENT_INVALID when the pmap is NULL or no page table covers
 * the start address; otherwise returns the number of resident bytes and adds
 * (accumulates, does not overwrite) the compressed byte count into
 * *compressed_bytes_p when it is non-NULL.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Walk the leaf PTEs of [start, end) and classify each page. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin while writing so the page can't become PPL-owned mid-update. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12010 
12011 mach_vm_size_t
12012 pmap_query_resident(
12013 	pmap_t                  pmap,
12014 	vm_map_address_t        start,
12015 	vm_map_address_t        end,
12016 	mach_vm_size_t          *compressed_bytes_p)
12017 {
12018 	mach_vm_size_t          total_resident_bytes;
12019 	mach_vm_size_t          compressed_bytes;
12020 	vm_map_address_t        va;
12021 
12022 
12023 	if (pmap == PMAP_NULL) {
12024 		if (compressed_bytes_p) {
12025 			*compressed_bytes_p = 0;
12026 		}
12027 		return 0;
12028 	}
12029 
12030 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12031 
12032 	total_resident_bytes = 0;
12033 	compressed_bytes = 0;
12034 
12035 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12036 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12037 	    VM_KERNEL_ADDRHIDE(end));
12038 
12039 	va = start;
12040 	while (va < end) {
12041 		vm_map_address_t l;
12042 		mach_vm_size_t resident_bytes;
12043 
12044 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12045 
12046 		if (l > end) {
12047 			l = end;
12048 		}
12049 #if XNU_MONITOR
12050 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12051 #else
12052 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12053 #endif
12054 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12055 			break;
12056 		}
12057 
12058 		total_resident_bytes += resident_bytes;
12059 
12060 		va = l;
12061 	}
12062 
12063 	if (compressed_bytes_p) {
12064 		*compressed_bytes_p = compressed_bytes;
12065 	}
12066 
12067 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12068 	    total_resident_bytes);
12069 
12070 	return total_resident_bytes;
12071 }
12072 
#if MACH_ASSERT
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	/*
	 * Skip pmaps that were never, or are no longer, fully associated with a
	 * task (e.g. the old pmap left behind by fork()/exec() or spawn()).
	 * Their "ledger" still points at a task that is now using a different
	 * (and active) address space, so the pmap ledgers cannot be expected
	 * to balance here.
	 *
	 * If the "pid" is set, we went through pmap_set_process() in
	 * task_terminate_internal(), so this task's ledger should not have
	 * been re-used and all the pmap ledgers should be back to 0.
	 */
	const int pid = pmap->pmap_pid;
	if (pid == 0 || pid == -1) {
		return;
	}

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, pmap->pmap_procname);
}
#endif /* MACH_ASSERT */
12103 
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
	/* Deliberate no-op on this architecture; both arguments are ignored. */
}
12108 
12109 /**
12110  * The minimum shared region nesting size is used by the VM to determine when to
12111  * break up large mappings to nested regions. The smallest size that these
12112  * mappings can be broken into is determined by what page table level those
12113  * regions are being nested in at and the size of the page tables.
12114  *
12115  * For instance, if a nested region is nesting at L2 for a process utilizing
12116  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12117  * block entry).
12118  *
12119  * @param pmap The target pmap to determine the block size based on whether it's
12120  *             using 16KB or 4KB page tables.
12121  */
12122 uint64_t
12123 pmap_shared_region_size_min(__unused pmap_t pmap)
12124 {
12125 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12126 
12127 	/**
12128 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12129 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12130 	 * point to shared L3 page tables in the shared region pmap.
12131 	 */
12132 	return pt_attr_twig_size(pt_attr);
12133 }
12134 
12135 boolean_t
12136 pmap_enforces_execute_only(
12137 	pmap_t pmap)
12138 {
12139 	return pmap != kernel_pmap;
12140 }
12141 
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	/* Record the owning vm_map's code-signing enforcement state on the pmap. */
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12150 
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
	/* Route through the PPL when present; the flag lives in PPL-owned state. */
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12162 
12163 extern int cs_process_enforcement_enable;
12164 bool
12165 pmap_get_vm_map_cs_enforced(
12166 	pmap_t pmap)
12167 {
12168 	if (cs_process_enforcement_enable) {
12169 		return true;
12170 	}
12171 	return pmap->pmap_vm_map_cs_enforced;
12172 }
12173 
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	/* Deliberate no-op in this configuration; the argument is ignored. */
	return;
}
12180 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
	/* Route through the PPL when present; otherwise call directly. */
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12191 
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	/* JIT entitlement is never reported in this configuration. */
	return false;
}
12198 
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	/* Deliberate no-op in this configuration; the argument is ignored. */
	return;
}
12205 
void
pmap_set_tpro(
	pmap_t pmap)
{
	/* Route through the PPL when present; otherwise call directly. */
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12216 
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	/* TPRO is never reported as enabled in this configuration. */
	return false;
}
12223 
12224 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12225 
/**
 * Report the disposition (present / compressed / reusable / internal /
 * alt-accounted) of the page mapped at `va` in `pmap`.
 *
 * Writes a PMAP_QUERY_PAGE_* bitmask to *disp_p and returns KERN_SUCCESS,
 * or KERN_INVALID_ARGUMENT for a NULL/kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		/* Pin the output while writing, since this may run inside the PPL. */
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be changed concurrently by other CPUs. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		/* Re-read under the PV lock to detect a racing PTE update. */
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find this mapping's entry in the page's PV list (if it has one). */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the output while writing, since this may run inside the PPL. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12304 
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	/* Route through the PPL when present; otherwise call directly. */
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12317 
12318 
12319 
/* Number of significant user VA bits for this pmap. */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* Derive from the pmap's TCR: VA width = 64 - T0SZ. */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	/* Single page size: the boot-time T0SZ applies to every pmap. */
	return 64 - T0SZ_BOOT;
#endif
}
12330 
/* Number of significant kernel VA bits (64 - boot-time T1SZ). */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12336 
12337 static vm_map_size_t
12338 pmap_user_va_size(pmap_t pmap)
12339 {
12340 	return 1ULL << pmap_user_va_bits(pmap);
12341 }
12342 
12343 
12344 
bool
pmap_in_ppl(void)
{
	// Unsupported on this configuration: there is no PPL to be inside.
	return false;
}
12351 
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	/* Always fatal here: protected I/O writes are not supported on this platform. */
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12358 
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported: no PPL page reserve exists; callers must handle NULL.
	return NULL;
}
12365 
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported: deliberate no-op; pairs with pmap_claim_reserved_ppl_page().
}
12371 
12372 
12373 #if PMAP_CS_PPL_MONITOR
12374 
12375 /* Immutable part of the trust cache runtime */
12376 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12377 
12378 /* Mutable part of the trust cache runtime */
12379 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12380 
12381 /* Lock for the trust cache runtime */
12382 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12383 
12384 MARK_AS_PMAP_TEXT kern_return_t
12385 pmap_check_trust_cache_runtime_for_uuid_internal(
12386 	const uint8_t check_uuid[kUUIDSize])
12387 {
12388 	kern_return_t ret = KERN_DENIED;
12389 
12390 	if (amfi->TrustCache.version < 3) {
12391 		/* AMFI change hasn't landed in the build */
12392 		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12393 		return KERN_NOT_SUPPORTED;
12394 	}
12395 
12396 	/* Lock the runtime as shared */
12397 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12398 
12399 	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12400 		&ppl_trust_cache_rt,
12401 		check_uuid,
12402 		NULL);
12403 
12404 	/* Unlock the runtime */
12405 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12406 
12407 	if (tc_ret.error == kTCReturnSuccess) {
12408 		ret = KERN_SUCCESS;
12409 	} else if (tc_ret.error == kTCReturnNotFound) {
12410 		ret = KERN_NOT_FOUND;
12411 	} else {
12412 		ret = KERN_FAILURE;
12413 		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12414 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12415 	}
12416 
12417 	return ret;
12418 }
12419 
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	/* Trust cache state lives inside the PPL; forward the check there. */
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12426 
/**
 * PPL-side implementation of trust cache loading: validates the caller's
 * entitlement, locks down the payload and manifest pages, then hands the
 * image4 payload to libTrustCache under the exclusive runtime lock.
 *
 * On failure the payload pages are unlocked again; on success they stay
 * locked down (PPL-writable) for the lifetime of the loaded trust cache.
 * Returns KERN_RESOURCE_SHORTAGE when the PPL page reserve is empty so the
 * kernel-side wrapper can donate a page and retry.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Only dynamically-loadable trust cache types may come through here. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12546 
12547 kern_return_t
12548 pmap_load_trust_cache_with_type(
12549 	TCType_t type,
12550 	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12551 	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12552 	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12553 {
12554 	kern_return_t ret = KERN_DENIED;
12555 
12556 	ret = pmap_load_trust_cache_with_type_ppl(
12557 		type,
12558 		pmap_img4_payload, pmap_img4_payload_len,
12559 		img4_manifest, img4_manifest_len,
12560 		img4_aux_manifest, img4_aux_manifest_len);
12561 
12562 	while (ret == KERN_RESOURCE_SHORTAGE) {
12563 		/* Allocate a page from the free list */
12564 		pmap_alloc_page_for_ppl(0);
12565 
12566 		/* Attempt the call again */
12567 		ret = pmap_load_trust_cache_with_type_ppl(
12568 			type,
12569 			pmap_img4_payload, pmap_img4_payload_len,
12570 			img4_manifest, img4_manifest_len,
12571 			img4_aux_manifest, img4_aux_manifest_len);
12572 	}
12573 
12574 	return ret;
12575 }
12576 
12577 MARK_AS_PMAP_TEXT kern_return_t
12578 pmap_query_trust_cache_safe(
12579 	TCQueryType_t query_type,
12580 	const uint8_t cdhash[kTCEntryHashSize],
12581 	TrustCacheQueryToken_t *query_token)
12582 {
12583 	kern_return_t ret = KERN_NOT_FOUND;
12584 
12585 	/* Validate the query type preemptively */
12586 	if (query_type >= kTCQueryTypeTotal) {
12587 		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12588 		return KERN_INVALID_ARGUMENT;
12589 	}
12590 
12591 	/* Lock the runtime as shared */
12592 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12593 
12594 	TCReturn_t tc_ret = amfi->TrustCache.query(
12595 		&ppl_trust_cache_rt,
12596 		query_type,
12597 		cdhash,
12598 		query_token);
12599 
12600 	/* Unlock the runtime */
12601 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12602 
12603 	if (tc_ret.error == kTCReturnSuccess) {
12604 		ret = KERN_SUCCESS;
12605 	} else if (tc_ret.error == kTCReturnNotFound) {
12606 		ret = KERN_NOT_FOUND;
12607 	} else {
12608 		ret = KERN_FAILURE;
12609 		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12610 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12611 	}
12612 
12613 	return ret;
12614 }
12615 
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	/* Copy the result out to the (kernel-owned) token, pinned during the write. */
	if (query_token != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
12640 
12641 kern_return_t
12642 pmap_query_trust_cache(
12643 	TCQueryType_t query_type,
12644 	const uint8_t cdhash[kTCEntryHashSize],
12645 	TrustCacheQueryToken_t *query_token)
12646 {
12647 	kern_return_t ret = KERN_NOT_FOUND;
12648 
12649 	ret = pmap_query_trust_cache_ppl(
12650 		query_type,
12651 		cdhash,
12652 		query_token);
12653 
12654 	return ret;
12655 }
12656 
12657 MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
12658 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12659 
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	/* Whether developer mode has ever been explicitly set on this boot. */
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
12688 
void
pmap_toggle_developer_mode(
	bool state)
{
	/* Developer mode state is PPL-owned; forward the transition there. */
	pmap_toggle_developer_mode_ppl(state);
}
12695 
12696 #endif /* PMAP_CS_PPL_MONITOR */
12697 
12698 #if PMAP_CS_INCLUDE_CODE_SIGNING
12699 
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	/* Order profile nodes by their raw addresses: -1, 0 or 1. */
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
12712 
12713 /* Red-black tree for managing provisioning profiles */
12714 MARK_AS_PMAP_DATA static
12715 RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;
12716 
12717 RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
12718 RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
12719 
12720 /* Lock for the profile red-black tree */
12721 MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
12722 
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* This lock is taken inside the PPL, so it must never sleep. */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
12735 
12736 static bool
12737 pmap_is_testflight_profile(
12738 	pmap_cs_profile_t *profile_obj)
12739 {
12740 	const char *entitlement_name = "beta-reports-active";
12741 	const size_t entitlement_length = strlen(entitlement_name);
12742 	CEQueryOperation_t query[2] = {0};
12743 
12744 	/* If the profile provisions no entitlements, then it isn't a test flight one */
12745 	if (profile_obj->entitlements_ctx == NULL) {
12746 		return false;
12747 	}
12748 
12749 	/* Build our CoreEntitlements query */
12750 	query[0].opcode = kCEOpSelectKey;
12751 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
12752 	query[0].parameters.stringParameter.length = entitlement_length;
12753 	query[1] = CEMatchBool(true);
12754 
12755 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
12756 		profile_obj->entitlements_ctx,
12757 		query, 2);
12758 
12759 	if (ce_err == amfi->CoreEntitlements.kNoError) {
12760 		return true;
12761 	}
12762 
12763 	return false;
12764 }
12765 
12766 static bool
12767 pmap_is_development_profile(
12768 	pmap_cs_profile_t *profile_obj)
12769 {
12770 	/* Check for UPP */
12771 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
12772 		*profile_obj->profile_ctx,
12773 		CESelectDictValue("ProvisionsAllDevices"));
12774 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
12775 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
12776 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
12777 			return false;
12778 		}
12779 	}
12780 
12781 	/* Check for TestFlight profile */
12782 	if (pmap_is_testflight_profile(profile_obj) == true) {
12783 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
12784 		return false;
12785 	}
12786 
12787 	pmap_cs_log_info("%p: development profile", profile_obj);
12788 	return true;
12789 }
12790 
12791 static kern_return_t
12792 pmap_initialize_profile_entitlements(
12793 	pmap_cs_profile_t *profile_obj)
12794 {
12795 	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
12796 		*profile_obj->profile_ctx,
12797 		CESelectDictValue("Entitlements"));
12798 
12799 	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
12800 		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
12801 		profile_obj->entitlements_ctx = NULL;
12802 
12803 		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
12804 		return KERN_NOT_FOUND;
12805 	}
12806 
12807 	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
12808 	const uint8_t *der_end = entitlements_der_ctx.state.der_end;
12809 
12810 	CEValidationResult ce_result = {0};
12811 	CEError_t ce_err = amfi->CoreEntitlements.Validate(
12812 		pmap_cs_core_entitlements_runtime,
12813 		&ce_result,
12814 		der_start, der_end);
12815 	if (ce_err != amfi->CoreEntitlements.kNoError) {
12816 		pmap_cs_log_error("unable to validate profile entitlements: %s",
12817 		    amfi->CoreEntitlements.GetErrorString(ce_err));
12818 
12819 		return KERN_ABORTED;
12820 	}
12821 
12822 	struct CEQueryContext query_ctx = {0};
12823 	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
12824 		pmap_cs_core_entitlements_runtime,
12825 		ce_result,
12826 		&query_ctx);
12827 	if (ce_err != amfi->CoreEntitlements.kNoError) {
12828 		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
12829 		    amfi->CoreEntitlements.GetErrorString(ce_err));
12830 
12831 		return KERN_ABORTED;
12832 	}
12833 
12834 	/* Setup the entitlements context within the profile object */
12835 	profile_obj->entitlements_ctx_storage = query_ctx;
12836 	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;
12837 
12838 	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
12839 	return KERN_SUCCESS;
12840 }
12841 
/*
 * PPL entry point for registering a provisioning profile with the monitor.
 *
 * Locks down the caller-supplied payload (a pmap_profile_payload_t followed
 * by the raw profile blob), validates the blob through CoreTrust, builds a
 * CoreEntitlements context over the profile's DER content, initializes the
 * PPL-side profile object and inserts it into the registration tree.
 *
 * @param payload_addr
 *	Kernel virtual address of the payload; its pages become PPL-owned.
 * @param payload_size
 *	Total size of the payload in bytes.
 *
 * @return
 *	KERN_SUCCESS on registration; KERN_RESOURCE_SHORTAGE when the caller
 *	must donate a page and retry (see pmap_register_provisioning_profile).
 *	Malformed or untrusted payloads panic, since by then the pages have
 *	already been locked down.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* A shortage is expected and retried by the caller; don't log it */
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a DER VM context over the CoreTrust-verified profile content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
12945 
12946 kern_return_t
12947 pmap_register_provisioning_profile(
12948 	const vm_address_t payload_addr,
12949 	const vm_size_t payload_size)
12950 {
12951 	kern_return_t ret = KERN_DENIED;
12952 
12953 	ret = pmap_register_provisioning_profile_ppl(
12954 		payload_addr,
12955 		payload_size);
12956 
12957 	while (ret == KERN_RESOURCE_SHORTAGE) {
12958 		/* Allocate a page from the free list */
12959 		pmap_alloc_page_for_ppl(0);
12960 
12961 		/* Attempt the call again */
12962 		ret = pmap_register_provisioning_profile_ppl(
12963 			payload_addr,
12964 			payload_size);
12965 	}
12966 
12967 	return ret;
12968 }
12969 
/*
 * PPL entry point for unregistering a provisioning profile.
 *
 * The profile must currently be in the registration tree (panics otherwise)
 * and must have no outstanding associations (reference count zero). On
 * success the original payload pages are unlocked and returned to the kernel.
 *
 * @return KERN_SUCCESS on unregistration, KERN_FAILURE when the profile is
 *	still referenced by one or more code signatures.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A non-zero count means a signature still holds this profile */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		/* Lockdown operated on whole pages, so unlock whole pages as well */
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The object lives inside the now-released payload -- drop the pointer */
		profile_obj = NULL;
	}
	return ret;
}
13017 
/*
 * Kernel-side entry point for unregistering a provisioning profile; simply
 * trampolines into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13024 
/*
 * PPL entry point to associate a registered provisioning profile with a code
 * signature (code directory) before the signature is verified.
 *
 * Lock order: the code directory lock is taken exclusively first, then the
 * profile tree lock shared; the association itself bumps the profile's
 * atomic reference count.
 *
 * @return KERN_SUCCESS on association, KERN_DENIED when the signature is
 *	already verified or already has a profile. Panics on an unknown or
 *	unvalidated profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* Drop the code directory lock taken by pmap_cs_lock_code_directory */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13073 
/*
 * Kernel-side entry point for associating a profile with a code signature;
 * simply trampolines into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13081 
/*
 * PPL entry point to break the association between a code signature and its
 * provisioning profile, dropping the profile's reference count.
 *
 * @return KERN_SUCCESS on disassociation, KERN_NOT_FOUND when the signature
 *	has no associated profile.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	/* Drop the reference outside the code directory lock */
	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13117 
/*
 * Kernel-side entry point for disassociating a profile from a code
 * signature; simply trampolines into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13124 
13125 kern_return_t
13126 pmap_associate_kernel_entitlements_internal(
13127 	pmap_cs_code_directory_t *cd_entry,
13128 	const void *kernel_entitlements)
13129 {
13130 	kern_return_t ret = KERN_DENIED;
13131 
13132 	if (kernel_entitlements == NULL) {
13133 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13134 	}
13135 
13136 	/* Acquire the lock on the code directory */
13137 	pmap_cs_lock_code_directory(cd_entry);
13138 
13139 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13140 		ret = KERN_DENIED;
13141 		goto out;
13142 	} else if (cd_entry->kernel_entitlements != NULL) {
13143 		ret = KERN_DENIED;
13144 		goto out;
13145 	}
13146 	cd_entry->kernel_entitlements = kernel_entitlements;
13147 
13148 	/* Association was a success */
13149 	ret = KERN_SUCCESS;
13150 
13151 out:
13152 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13153 	return ret;
13154 }
13155 
/*
 * Kernel-side entry point for associating kernel entitlements with a code
 * signature; simply trampolines into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13163 
/*
 * PPL entry point to resolve the kernel entitlements object attached to the
 * main region's code signature of a (non-kernel) pmap.
 *
 * @param pmap
 *	The pmap whose entitlements should be resolved; validated before use.
 * @param kernel_entitlements
 *	Optional out-pointer (kernel memory, pinned around the write) that
 *	receives the entitlements object on success.
 *
 * @return KERN_SUCCESS when resolved, KERN_NOT_FOUND when the pmap is the
 *	kernel pmap, has no signed main region, or no entitlements attached.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Take a shared lock on the PMAP */
	pmap_lock(pmap, PMAP_LOCK_SHARED);

	if (pmap == kernel_pmap) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13224 
/*
 * Kernel-side entry point for resolving the kernel entitlements of a pmap;
 * simply trampolines into the PPL.
 */
kern_return_t
pmap_resolve_kernel_entitlements(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	return pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
}
13232 
/*
 * PPL entry point to build a CoreEntitlements acceleration index for a code
 * signature's entitlements context.
 *
 * The index buffer is placed, in order of preference, in (1) the unused tail
 * of the locked-down code signature region, (2) a bucket from the PPL blob
 * allocator, or (3) a whole page from the PPL free list. The buffer is then
 * published through the global pmap_cs_acceleration_buf (under its lock) for
 * the CoreEntitlements allocator callback to consume.
 *
 * @return KERN_SUCCESS when accelerated (or acceleration is unnecessary),
 *	KERN_DENIED for non-reconstituted signatures, KERN_ABORTED when the
 *	index cannot fit in a page, or an allocator error (including
 *	KERN_RESOURCE_SHORTAGE, which the kernel-side wrapper retries).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without a context, or if it is already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Not separately allocated -- lives inside the signature region */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13379 
13380 kern_return_t
13381 pmap_accelerate_entitlements(
13382 	pmap_cs_code_directory_t *cd_entry)
13383 {
13384 	kern_return_t ret = KERN_DENIED;
13385 
13386 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
13387 	while (ret == KERN_RESOURCE_SHORTAGE) {
13388 		/* Allocate a page for the PPL */
13389 		pmap_alloc_page_for_ppl(0);
13390 
13391 		/* Try again */
13392 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
13393 	}
13394 
13395 	return ret;
13396 }
13397 
13398 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13399 
13400 MARK_AS_PMAP_TEXT bool
13401 pmap_lookup_in_loaded_trust_caches_internal(
13402 	const uint8_t cdhash[CS_CDHASH_LEN])
13403 {
13404 	kern_return_t kr = KERN_NOT_FOUND;
13405 
13406 #if PMAP_CS_PPL_MONITOR
13407 	/*
13408 	 * If we have the PPL monitor, then this function can only be called from
13409 	 * within the PPL. Calling it directly would've caused a panic, so we can
13410 	 * assume that we're in the PPL here.
13411 	 */
13412 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13413 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13414 
13415 	kr = pmap_query_trust_cache_safe(
13416 		kTCQueryTypeLoadable,
13417 		cdhash_safe,
13418 		NULL);
13419 #else
13420 	kr = query_trust_cache(
13421 		kTCQueryTypeLoadable,
13422 		cdhash,
13423 		NULL);
13424 #endif
13425 
13426 	if (kr == KERN_SUCCESS) {
13427 		return true;
13428 	}
13429 	return false;
13430 }
13431 
/*
 * Look up a CDHash in the loadable trust caches, routing through the PPL
 * when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13442 
13443 MARK_AS_PMAP_TEXT uint32_t
13444 pmap_lookup_in_static_trust_cache_internal(
13445 	const uint8_t cdhash[CS_CDHASH_LEN])
13446 {
13447 	TrustCacheQueryToken_t query_token = {0};
13448 	kern_return_t kr = KERN_NOT_FOUND;
13449 	uint64_t flags = 0;
13450 	uint8_t hash_type = 0;
13451 
13452 #if PMAP_CS_PPL_MONITOR
13453 	/*
13454 	 * If we have the PPL monitor, then this function can only be called from
13455 	 * within the PPL. Calling it directly would've caused a panic, so we can
13456 	 * assume that we're in the PPL here.
13457 	 */
13458 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13459 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13460 
13461 	kr = pmap_query_trust_cache_safe(
13462 		kTCQueryTypeStatic,
13463 		cdhash_safe,
13464 		&query_token);
13465 #else
13466 	kr = query_trust_cache(
13467 		kTCQueryTypeStatic,
13468 		cdhash,
13469 		&query_token);
13470 #endif
13471 
13472 	if (kr == KERN_SUCCESS) {
13473 		amfi->TrustCache.queryGetFlags(&query_token, &flags);
13474 		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
13475 
13476 		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
13477 		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
13478 		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
13479 	}
13480 
13481 	return 0;
13482 }
13483 
/*
 * Look up a CDHash in the static trust cache, routing through the PPL when
 * the monitor is enabled. Returns the packed result word (0 if not found).
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13493 
13494 #if PMAP_CS_INCLUDE_CODE_SIGNING
13495 
/* Protects pmap_compilation_service_cdhash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* Compilation service CDHash; all-zero until installed by the setter below */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13498 
13499 MARK_AS_PMAP_TEXT void
13500 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13501 {
13502 
13503 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13504 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13505 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13506 
13507 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13508 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13509 }
13510 
13511 MARK_AS_PMAP_TEXT bool
13512 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13513 {
13514 	bool match = false;
13515 
13516 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13517 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13518 		match = true;
13519 	}
13520 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13521 
13522 	if (match) {
13523 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13524 	}
13525 
13526 	return match;
13527 }
13528 
/*
 * Install the compilation service CDHash, routing through the PPL when the
 * monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13538 
/*
 * Match a CDHash against the compilation service CDHash, routing through the
 * PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13548 
13549 /*
13550  * As part of supporting local signing on the device, we need the PMAP layer
13551  * to store the local signing key so that PMAP_CS can validate with it. We
13552  * store it at the PMAP layer such that it is accessible to both AMFI and
13553  * PMAP_CS should they need it.
13554  */
/* True once the local signing public key has been installed (set at most once) */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Storage for the local signing public key */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13557 
13558 MARK_AS_PMAP_TEXT void
13559 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13560 {
13561 	bool key_set = false;
13562 
13563 	/*
13564 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13565 	 * a successful exchange means that the local signing public key has _not_ been
13566 	 * set. In case the key has been set, we panic as we would never expect the
13567 	 * kernel to attempt to set the key more than once.
13568 	 */
13569 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13570 
13571 	if (key_set) {
13572 		panic("attempted to set the local signing public key multiple times");
13573 	}
13574 
13575 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13576 	pmap_cs_log_info("set local signing public key");
13577 }
13578 
/*
 * Install the local signing public key, routing through the PPL when the
 * monitor is enabled.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13588 
13589 uint8_t*
13590 pmap_get_local_signing_public_key(void)
13591 {
13592 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13593 
13594 	if (key_set) {
13595 		return pmap_local_signing_public_key;
13596 	}
13597 
13598 	return NULL;
13599 }
13600 
13601 /*
13602  * Locally signed applications need to be explicitly authorized by an entitled application
13603  * before we allow them to run.
13604  */
/* CDHash of the currently unrestricted locally-signed application (all-zero: none) */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13607 
/*
 * PPL entry point to unrestrict a locally-signed application: stashes the
 * CDHash allowed to run. Only one CDHash is unrestricted at a time; each
 * call replaces the previous one.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13620 
13621 void
13622 pmap_unrestrict_local_signing(
13623 	const uint8_t cdhash[CS_CDHASH_LEN])
13624 {
13625 #if XNU_MONITOR
13626 	return pmap_unrestrict_local_signing_ppl(cdhash);
13627 #else
13628 	return pmap_unrestrict_local_signing_internal(cdhash);
13629 #endif
13630 }
13631 
#if PMAP_CS
/*
 * Revoke any previously granted local-signing authorization by clearing the
 * stored CDHash.  After this, pmap_local_signing_restricted() reports every
 * CDHash as restricted until pmap_unrestrict_local_signing() runs again.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}

/*
 * Return true if the given CDHash has NOT been explicitly authorized, i.e.
 * it differs from the single CDHash recorded by
 * pmap_unrestrict_local_signing_internal().
 */
MARK_AS_PMAP_TEXT static bool
pmap_local_signing_restricted(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	/* Compare under the lock to avoid racing with a concurrent update. */
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	return ret != 0;
}

#endif /* PMAP_CS */
#endif
13654 
13655 MARK_AS_PMAP_TEXT void
13656 pmap_footprint_suspend_internal(
13657 	vm_map_t        map,
13658 	boolean_t       suspend)
13659 {
13660 #if DEVELOPMENT || DEBUG
13661 	if (suspend) {
13662 		current_thread()->pmap_footprint_suspended = TRUE;
13663 		map->pmap->footprint_was_suspended = TRUE;
13664 	} else {
13665 		current_thread()->pmap_footprint_suspended = FALSE;
13666 	}
13667 #else /* DEVELOPMENT || DEBUG */
13668 	(void) map;
13669 	(void) suspend;
13670 #endif /* DEVELOPMENT || DEBUG */
13671 }
13672 
/*
 * Public entry point to suspend/resume footprint accounting for the current
 * thread on the given map; routes into the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
13684 
/*
 * No-op PPL-side routine: validates the pmap argument and does nothing else.
 * NOTE(review): presumably exists to exercise/measure the bare PPL call
 * path — confirm with callers of pmap_nop().
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
13690 
/*
 * No-op pmap call; dispatches to the PPL when the monitor is enabled,
 * otherwise calls the internal implementation directly.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
13700 
13701 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
13702 
/*
 * Header written in front of each translation table copied out by
 * pmap_dump_page_tables_recurse().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the copied table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA just past the range this table covers */
};
13709 
/*
 * Copy one level of the pmap's page-table hierarchy into the caller's
 * buffer (when the level is selected in level_mask), then recurse into
 * every nested table entry.  Each dumped table is preceded by a
 * page_table_dump_header.  *bytes_copied is the running offset into
 * buf_start and is advanced as data is emitted.
 *
 * Returns KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer
 * cannot hold the current table plus its header.  Panics on a table
 * entry at leaf level that is neither invalid nor a block/page mapping.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry: VA span per entry and entry classification masks. */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Next free position in the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be smaller than a full page worth of entries. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Conservatively require room for this table even if it won't be dumped. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block/page mapping: nothing to descend into. */
			continue;
		} else {
			/* A table-type entry below leaf level would be corrupt. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
13783 
13784 kern_return_t
13785 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
13786 {
13787 	if (not_in_kdp) {
13788 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
13789 	}
13790 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
13791 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
13792 }
13793 
13794 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13795 
/*
 * Page-table dumping is only implemented on arm64 DEVELOPMENT/DEBUG
 * kernels; on every other configuration report it as unsupported.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
13802 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13803 
13804 
13805 #ifdef CONFIG_XNUPOST
13806 #ifdef __arm64__
13807 static volatile bool pmap_test_took_fault = false;
13808 
13809 static bool
13810 pmap_test_fault_handler(arm_saved_state_t * state)
13811 {
13812 	bool retval                 = false;
13813 	uint32_t esr                = get_saved_state_esr(state);
13814 	esr_exception_class_t class = ESR_EC(esr);
13815 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
13816 
13817 	if ((class == ESR_EC_DABORT_EL1) &&
13818 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
13819 		pmap_test_took_fault = true;
13820 		/* return to the instruction immediately after the call to NX page */
13821 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
13822 		retval = true;
13823 	}
13824 
13825 	return retval;
13826 }
13827 
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write access to 'va' and report whether the
 * observed fault behavior matches expectations.
 *
 * If 'pmap' is non-NULL, temporarily switches to it (with interrupts and
 * preemption disabled, and PAN disabled where available, since the access
 * targets user-range addresses).  The access is bracketed by
 * ml_expect_fault_begin/end with pmap_test_fault_handler installed.
 *
 * Returns true when (fault taken) == should_fault.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
13880 
13881 static bool
13882 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
13883 {
13884 	bool retval = pmap_test_access(pmap, va, should_fault, false);
13885 
13886 	if (!retval) {
13887 		T_FAIL("%s: %s, "
13888 		    "pmap=%p, va=%p, should_fault=%u",
13889 		    __func__, should_fault ? "did not fault" : "faulted",
13890 		    pmap, (void*)va, (unsigned)should_fault);
13891 	}
13892 
13893 	return retval;
13894 }
13895 
13896 static bool
13897 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
13898 {
13899 	bool retval = pmap_test_access(pmap, va, should_fault, true);
13900 
13901 	if (!retval) {
13902 		T_FAIL("%s: %s, "
13903 		    "pmap=%p, va=%p, should_fault=%u",
13904 		    __func__, should_fault ? "did not fault" : "faulted",
13905 		    pmap, (void*)va, (unsigned)should_fault);
13906 	}
13907 
13908 	return retval;
13909 }
13910 
13911 static bool
13912 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
13913 {
13914 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13915 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
13916 
13917 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
13918 
13919 	if (!retval) {
13920 		T_FAIL("%s: bits=%u, "
13921 		    "pa=%p, should_be_set=%u",
13922 		    __func__, bits,
13923 		    (void*)pa, should_be_set);
13924 	}
13925 
13926 	return retval;
13927 }
13928 
13929 static __attribute__((noinline)) bool
13930 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
13931 {
13932 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
13933 	return retval;
13934 }
13935 
/*
 * Run the pmap functional test suite against a pmap created with the given
 * creation flags (e.g. PMAP_CREATE_FORCE_4K_PAGES).  Exercises mapping
 * creation, PTE PA/VA lookups, permission enforcement, the ref/mod state
 * machine, pmap_protect/pmap_page_protect, and pmap_disconnect.  Panics on
 * setup failures; functional failures are reported via T_FAIL.  Returns 0.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* page_ratio = number of pmap pages per native kernel page. */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT.  This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault.  Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages.  Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
14200 #endif /* __arm64__ */
14201 
/*
 * XNUPOST entry point for the pmap tests.  On arm64 this runs the test
 * configuration for each supported page size (4KB and 16KB when mixed page
 * sizes are available); on other architectures it is effectively a no-op.
 * Always returns KERN_SUCCESS; individual failures are reported via T_FAIL.
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
14223 #endif /* CONFIG_XNUPOST */
14224 
14225 /*
14226  * The following function should never make it to RELEASE code, since
14227  * it provides a way to get the PPL to modify text pages.
14228  */
14229 #if DEVELOPMENT || DEBUG
14230 
#define ARM_UNDEFINED_INSN 0xe7f000f0       /* 32-bit ARM illegal-instruction pattern */
#define ARM_UNDEFINED_INSN_THUMB 0xde00     /* 16-bit Thumb illegal-instruction pattern */
14233 
14234 /**
14235  * Forcibly overwrite executable text with an illegal instruction.
14236  *
14237  * @note Only used for xnu unit testing.
14238  *
14239  * @param pa The physical address to corrupt.
14240  *
14241  * @return KERN_SUCCESS on success.
14242  */
14243 kern_return_t
14244 pmap_test_text_corruption(pmap_paddr_t pa)
14245 {
14246 #if XNU_MONITOR
14247 	return pmap_test_text_corruption_ppl(pa);
14248 #else /* XNU_MONITOR */
14249 	return pmap_test_text_corruption_internal(pa);
14250 #endif /* XNU_MONITOR */
14251 }
14252 
/*
 * Overwrite the instruction at physical address 'pa' with an undefined
 * encoding, via the physical aperture.  Takes the PV head lock for the
 * page; for executable pages (PVH_FLAG_EXEC) the aperture mapping is
 * temporarily switched from read-only (AP_RONA) to writable (AP_RWNA)
 * around the write.  Invalidates the icache for the touched region.
 * Only used for xnu unit testing; returns KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	/* The page must be mapped (have at least one PV entry). */
	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages need the aperture mapping made writable first. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the aperture mapping to read-only. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14295 
14296 #endif /* DEVELOPMENT || DEBUG */
14297