xref: /xnu-11417.101.15/osfmk/arm/pmap/pmap.c (revision e3723e1f17661b24996789d8afc084c0c3303b26)
1 /*
2  * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 
109 #if HIBERNATION
110 #include <IOKit/IOHibernatePrivate.h>
111 #endif /* HIBERNATION */
112 
113 #ifdef __ARM64_PMAP_SUBPAGE_L1__
114 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
115 #else
116 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
117 #endif
118 
119 #if __ARM_VMSA__ != 8
120 #error Unknown __ARM_VMSA__
121 #endif
122 
123 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
124 
125 extern u_int32_t random(void); /* from <libkern/libkern.h> */
126 
127 static bool alloc_asid(pmap_t pmap);
128 static void free_asid(pmap_t pmap);
129 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
130 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
131 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
132 
/*
 * Operations vector for the native (host stage-1) page table format.
 * Installed into the page_table_attr structures below via pta_ops.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,                                    /* allocate an ASID for a pmap */
	.free_id = free_asid,                                      /* release a pmap's ASID */
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async, /* async TLB flush of a VA range */
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,          /* async TLB flush of a whole ASID */
	.wimg_to_pte = wimg_to_pte,                                /* translate WIMG attributes to PTE bits */
};
141 
/*
 * Per-level translation table geometry for the 16KB granule, levels 0-3.
 * Level 3 holds leaf (page) descriptors, hence the distinct valid/type
 * encodings (ARM_PTE_TYPE_VALID / ARM_TTE_TYPE_L3BLOCK) relative to the
 * table levels above it.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
181 
/*
 * Per-level translation table geometry for the 4KB granule, levels 0-3.
 * As with the 16K table above, level 3 uses the leaf-descriptor
 * valid/type encodings.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
221 
/*
 * Per-level translation table geometry for stage-2 translation with the
 * 4KB granule.  Level 0 is present only to keep the array indexable by
 * level and is unused; level 1 may use concatenated tables, in which case
 * its index mask is wider than the normal stage-1 mask.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
265 
/*
 * Page table attributes for pmaps using a 4KB translation granule.
 * pta_root_level is derived from T0SZ_BOOT; with 4K pages each table
 * level below the root resolves 9 bits of VA (hence the division by 9).
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never encodings for this format. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,  /* execute-never at both privilege levels */
	.ap_x = ARM_PTE_PNX,                /* user-executable; privileged-execute-never */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
293 
/*
 * Page table attributes for pmaps using a 16KB translation granule.
 * Mirrors pmap_pt_attr_4k with 16K geometry; the root is always at L1.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never encodings for this format. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,  /* execute-never at both privilege levels */
	.ap_x = ARM_PTE_PNX,                /* user-executable; privileged-execute-never */
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
313 
314 #if __ARM_16K_PG__
315 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
316 #else /* !__ARM_16K_PG__ */
317 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
318 #endif /* !__ARM_16K_PG__ */
319 
320 
321 #if DEVELOPMENT || DEBUG
322 int vm_footprint_suspend_allowed = 1;
323 
324 extern int pmap_ledgers_panic;
325 extern int pmap_ledgers_panic_leeway;
326 
327 #endif /* DEVELOPMENT || DEBUG */
328 
329 #if DEVELOPMENT || DEBUG
330 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
331 	(current_thread()->pmap_footprint_suspended)
332 #else /* DEVELOPMENT || DEBUG */
333 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
334 #endif /* DEVELOPMENT || DEBUG */
335 
336 
337 /*
338  * Represents a tlb range that will be flushed before exiting
339  * the ppl.
340  * Used by phys_attribute_clear_range to defer flushing pages in
341  * this range until the end of the operation.
342  */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;             /* pmap whose mappings fall in the range */
	vm_map_address_t ptfr_start;  /* start VA of the deferred-flush range */
	vm_map_address_t ptfr_end;    /* end VA of the deferred-flush range */
	bool ptfr_flush_needed;       /* true if the deferred flush must actually be issued */
} pmap_tlb_flush_range_t;
349 
350 #if XNU_MONITOR
351 /*
352  * PPL External References.
353  */
354 extern vm_offset_t   segPPLDATAB;
355 extern unsigned long segSizePPLDATA;
356 extern vm_offset_t   segPPLTEXTB;
357 extern unsigned long segSizePPLTEXT;
358 extern vm_offset_t   segPPLDATACONSTB;
359 extern unsigned long segSizePPLDATACONST;
360 
361 
362 /*
363  * PPL Global Variables
364  */
365 
366 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
367 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
368 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
369 #else
370 const boolean_t pmap_ppl_disable = FALSE;
371 #endif
372 
373 /*
374  * Indicates if the PPL has started applying APRR.
375  * This variable is accessed from various assembly trampolines, so be sure to change
376  * those if you change the size or layout of this variable.
377  */
378 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
379 
380 extern void *pmap_stacks_start;
381 extern void *pmap_stacks_end;
382 
#endif /* XNU_MONITOR */
384 
385 
386 
387 /* Virtual memory region for early allocation */
388 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
389 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
390 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
391 
392 extern uint8_t bootstrap_pagetables[];
393 
394 extern unsigned int not_in_kdp;
395 
396 extern vm_offset_t first_avail;
397 
398 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
399 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
400 extern vm_offset_t     static_memory_end;
401 
402 extern const vm_map_address_t physmap_base;
403 extern const vm_map_address_t physmap_end;
404 
405 extern int maxproc, hard_maxproc;
406 
407 /* The number of address bits one TTBR can cover. */
408 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
409 
410 /*
411  * The bounds on our TTBRs.  These are for sanity checking that
412  * an address is accessible by a TTBR before we attempt to map it.
413  */
414 
415 /* The level of the root of a page table. */
416 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
417 
418 /* The number of entries in the root TT of a page table. */
419 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
420 
421 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
422 const pmap_t    kernel_pmap = &kernel_pmap_store;
423 
424 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
425 
426 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
427 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
428 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
429 
430 typedef struct tt_free_entry {
431 	struct tt_free_entry    *next;
432 } tt_free_entry_t;
433 
434 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
435 
436 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
437 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
438 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
439 #define FREE_PAGE_SIZE_TT_MAX   4
440 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
441 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
442 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
443 
444 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
445 
446 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
447 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
448 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
449 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
450 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
451 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
452 
453 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
454 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
455 
456 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
457 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
458 
459 /* Lock group used for all pmap object locks. */
460 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
461 
462 #if DEVELOPMENT || DEBUG
463 int nx_enabled = 1;                                     /* enable no-execute protection */
464 int allow_data_exec  = 0;                               /* No apps may execute data */
465 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
466 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
467 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
468 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
469 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
470 #else /* DEVELOPMENT || DEBUG */
471 const int nx_enabled = 1;                                       /* enable no-execute protection */
472 const int allow_data_exec  = 0;                         /* No apps may execute data */
473 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
474 #endif /* DEVELOPMENT || DEBUG */
475 
476 /**
477  * This variable is set true during hibernation entry to protect pmap data structures
478  * during image copying, and reset false on hibernation exit.
479  */
480 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
481 
482 #if MACH_ASSERT
483 static void pmap_check_ledgers(pmap_t pmap);
484 #else
485 static inline void
pmap_check_ledgers(__unused pmap_t pmap)486 pmap_check_ledgers(__unused pmap_t pmap)
487 {
488 }
489 #endif /* MACH_ASSERT */
490 
491 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
492 
493 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
494 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
495 
496 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
497 
498 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
499 #if defined(__arm64__)
500 /* end of shared region + 512MB for various purposes */
501 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
502 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
503     "Minimum address space size outside allowable range");
504 
505 // Max offset is 15.375GB for devices with "large" memory config
506 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
507 // Max offset is 11.375GB for devices with "small" memory config
508 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
509 
510 
511 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
512     "Large device address space size outside allowable range");
513 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
514     "Small device address space size outside allowable range");
515 
516 #  ifdef XNU_TARGET_OS_OSX
517 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
518 #  else
519 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
520 #  endif
521 #endif /* __arm64__ */
522 
523 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
524 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
525 #else
526 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
527 #endif
528 
529 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
530 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
531 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
532 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
533 #if !HAS_16BIT_ASID
534 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
535 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
536 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
537 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
538 #else
539 static uint16_t last_allocated_asid = 0;
540 #endif /* !HAS_16BIT_ASID */
541 
542 #if HAS_SPECRES_DEBUGGING
543 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
544 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
545 #endif /* HAS_SPECRES_DEBUGGING */
546 
547 
548 #if __ARM_MIXED_PAGE_SIZE__
549 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
550 #endif
551 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
552 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
553 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
554 
555 /* PTE Define Macros */
556 
/*
 * True if PTE value (x) encodes a "compressed" page: hardware-invalid,
 * carrying the software ARM_PTE_COMPRESSED marker and no unexpected bits.
 * Panics (via the comma expression) if extra bits are present, since that
 * indicates PTE corruption.  (p) is the PTE's address, for diagnostics only.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True if the wired bit is set in PTE value (pte). */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True if the software "was writeable" bit is set in PTE value (pte). */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit on the PTE lvalue (pte). */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
578 
579 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)580 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
581 {
582 	if (wired) {
583 		*ptep |= ARM_PTE_WIRED;
584 	} else {
585 		*ptep &= ~ARM_PTE_WIRED;
586 	}
587 	/*
588 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
589 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
590 	 * never reclaimed.
591 	 */
592 	if (pmap == kernel_pmap) {
593 		return;
594 	}
595 	unsigned short *ptd_wiredcnt_ptr;
596 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
597 	if (wired) {
598 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
599 	} else {
600 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
601 		if (__improbable(prev_wired == 0)) {
602 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
603 		}
604 	}
605 }
606 
607 #if HAS_FEAT_XS
608 
609 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)610 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
611 {
612 	if (__improbable(pt_attr->stage2)) {
613 		return false;
614 	}
615 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
616 	case CACHE_ATTRINDX_DISABLE_XS:
617 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
618 		return true;
619 	default:
620 		return false;
621 	}
622 }
623 
624 #endif /* HAS_FEAT_XS */
625 
/*
 * Flush the TLB entries covering VA range [s, e) for the given pmap, then
 * synchronize so the flush is complete before proceeding.
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
	arm64_sync_tlb(strong);                                                                               \
}
630 
631 /*
632  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
633  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
634  * will observe the updated PTE.
635  */
636 #define FLUSH_PTE()                                                                     \
637 	__builtin_arm_dmb(DMB_ISH);
638 
639 /*
640  * Synchronize updates to PTEs that were previously valid and thus may be cached in
641  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
642  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
643  * program order will not issue until the DSB completes.  Prior loads may be reordered
644  * after the barrier, but their behavior should not be materially affected by the
645  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
646  * matter for loads until the access is re-driven well after the TLB update is
647  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
648  * we should be in a position to handle access faults.  For "voluntary" PTE access
649  * restriction due to unmapping or protection, the decision to restrict access should
650  * have a data dependency on prior loads in order to avoid a data race.
651  */
652 #define FLUSH_PTE_STRONG()                                                             \
653 	__builtin_arm_dsb(DSB_ISHST);
654 
655 /**
656  * Write enough page table entries to map a single VM page. On systems where the
657  * VM page size does not match the hardware page size, multiple page table
658  * entries will need to be written.
659  *
660  * @note This function does not emit a barrier to ensure these page table writes
661  *       have completed before continuing. This is commonly needed. In the case
662  *       where a DMB or DSB barrier is needed, then use the write_pte() and
663  *       write_pte_strong() functions respectively instead of this one.
664  *
665  * @param ptep Pointer to the first page table entry to update.
666  * @param pte The value to write into each page table entry. In the case that
667  *            multiple PTEs are updated to a non-empty value, then the address
668  *            in this value will automatically be incremented for each PTE
669  *            write.
670  */
671 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)672 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
673 {
674 	/**
675 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
676 	 * systems, which is why it's checked at runtime instead of compile time.
677 	 * The "unreachable" warning needs to be suppressed because it still is a
678 	 * compile time constant on some systems.
679 	 */
680 	__unreachable_ok_push
681 	if (TEST_PAGE_RATIO_4) {
682 		if (((uintptr_t)ptep) & 0x1f) {
683 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
684 			    __func__, ptep, (void*)pte);
685 		}
686 
687 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
688 			/**
689 			 * If we're writing an empty/compressed PTE value, then don't
690 			 * auto-increment the address for each PTE write.
691 			 */
692 			*ptep = pte;
693 			*(ptep + 1) = pte;
694 			*(ptep + 2) = pte;
695 			*(ptep + 3) = pte;
696 		} else {
697 			*ptep = pte;
698 			*(ptep + 1) = pte | 0x1000;
699 			*(ptep + 2) = pte | 0x2000;
700 			*(ptep + 3) = pte | 0x3000;
701 		}
702 	} else {
703 		*ptep = pte;
704 	}
705 	__unreachable_ok_pop
706 }
707 
708 /**
709  * Writes enough page table entries to map a single VM page and then ensures
710  * those writes complete by executing a Data Memory Barrier.
711  *
712  * @note The DMB issued by this function is not strong enough to protect against
713  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
714  *       instruction is going to immediately be called after this write, it's
715  *       recommended to call write_pte_strong() instead of this function.
716  *
717  * See the function header for write_pte_fast() for more details on the
718  * parameters.
719  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB: ensure the PTE store(s) are observed by subsequent loads */
}
726 
727 /**
728  * Writes enough page table entries to map a single VM page and then ensures
729  * those writes complete by executing a Data Synchronization Barrier. This
730  * barrier provides stronger guarantees than the DMB executed by write_pte().
731  *
732  * @note This function is useful if you're going to immediately flush the TLB
733  *       after making the PTE write. A DSB is required to protect against the
734  *       TLB invalidate being reordered before the PTE write.
735  *
736  * See the function header for write_pte_fast() for more details on the
737  * parameters.
738  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB: order the PTE store(s) before a following TLBI */
}
745 
746 /**
747  * Retrieve the pmap structure for the thread running on the current CPU.
748  */
749 pmap_t
current_pmap()750 current_pmap()
751 {
752 	const pmap_t current = vm_map_pmap(current_thread()->map);
753 
754 	assert(current != NULL);
755 
756 #if XNU_MONITOR
757 	/**
758 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
759 	 * decided by kernel-writable memory. This function is used in various parts
760 	 * of the PPL, and besides validating that the pointer returned by this
761 	 * function is indeed a pmap structure, it's also important to ensure that
762 	 * it's actually the current thread's pmap. This is because different pmaps
763 	 * will have access to different entitlements based on the code signature of
764 	 * their loaded process. So if a different user pmap is set in the current
765 	 * thread structure (in an effort to bypass code signing restrictions), even
766 	 * though the structure would validate correctly as it is a real pmap
767 	 * structure, it should fail here.
768 	 *
769 	 * This only needs to occur for user pmaps because the kernel pmap's root
770 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
771 	 * changed so it'd be redundant to check), and its code signing fields are
772 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
773 	 * it shouldn't be possible to set those fields. Due to that, an attacker
774 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
775 	 * this check won't accomplish anything as it doesn't provide any extra code
776 	 * signing entitlements.
777 	 */
778 	if ((current != kernel_pmap) &&
779 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
780 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
781 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
782 	}
783 #endif /* XNU_MONITOR */
784 
785 	return current;
786 }
787 
788 #if DEVELOPMENT || DEBUG
789 
790 /*
791  * Trace levels are controlled by a bitmask in which each
792  * level can be enabled/disabled by the (1<<level) position
793  * in the boot arg
794  * Level 0: PPL extension functionality
795  * Level 1: pmap lifecycle (create/destroy/switch)
796  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
797  * Level 3: internal state management (attributes/fast-fault)
798  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
799  */
800 
801 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
802 
803 #define PMAP_TRACE(level, ...) \
804 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
805 	        KDBG_RELEASE(__VA_ARGS__); \
806 	}
807 #else /* DEVELOPMENT || DEBUG */
808 
809 #define PMAP_TRACE(level, ...)
810 
811 #endif /* DEVELOPMENT || DEBUG */
812 
813 
814 /*
815  * Internal function prototypes (forward declarations).
816  */
817 
818 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
819 
820 static void pmap_set_reference(ppnum_t pn);
821 
822 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
823 
824 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
825 
826 static kern_return_t pmap_expand(
827 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
828 
829 static int pmap_remove_range(
830 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
831 
832 static tt_entry_t *pmap_tt1_allocate(
833 	pmap_t, vm_size_t, unsigned int);
834 
835 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
836 
837 static void pmap_tt1_deallocate(
838 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
839 
840 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
841 
842 static kern_return_t pmap_tt_allocate(
843 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
844 
845 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
846 
847 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
848 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
849 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
850 
851 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
852 
853 
854 static void pmap_unmap_commpage(
855 	pmap_t pmap);
856 
857 static boolean_t
858 pmap_is_64bit(pmap_t);
859 
860 
861 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
862 
863 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
864 
865 static bool pmap_update_cache_attributes_locked(
866 	ppnum_t, unsigned, bool);
867 
868 static boolean_t arm_clear_fast_fault(
869 	ppnum_t ppnum,
870 	vm_prot_t fault_type,
871 	pt_entry_t *pte_p);
872 
873 static void pmap_trim_self(pmap_t pmap);
874 static void pmap_trim_subord(pmap_t subord);
875 
876 
877 /*
878  * Temporary prototypes, while we wait for pmap_enter to move to taking an
879  * address instead of a page number.
880  */
881 static kern_return_t
882 pmap_enter_addr(
883 	pmap_t pmap,
884 	vm_map_address_t v,
885 	pmap_paddr_t pa,
886 	vm_prot_t prot,
887 	vm_prot_t fault_type,
888 	unsigned int flags,
889 	boolean_t wired);
890 
891 kern_return_t
892 pmap_enter_options_addr(
893 	pmap_t pmap,
894 	vm_map_address_t v,
895 	pmap_paddr_t pa,
896 	vm_prot_t prot,
897 	vm_prot_t fault_type,
898 	unsigned int flags,
899 	boolean_t wired,
900 	unsigned int options,
901 	__unused void   *arg,
902 	__unused pmap_mapping_type_t mapping_type);
903 
904 #ifdef CONFIG_XNUPOST
905 kern_return_t pmap_test(void);
906 #endif /* CONFIG_XNUPOST */
907 
908 PMAP_SUPPORT_PROTOTYPES(
909 	kern_return_t,
910 	arm_fast_fault, (pmap_t pmap,
911 	vm_map_address_t va,
912 	vm_prot_t fault_type,
913 	bool was_af_fault,
914 	bool from_user), ARM_FAST_FAULT_INDEX);
915 
916 PMAP_SUPPORT_PROTOTYPES(
917 	boolean_t,
918 	arm_force_fast_fault, (ppnum_t ppnum,
919 	vm_prot_t allow_mode,
920 	int options), ARM_FORCE_FAST_FAULT_INDEX);
921 
922 MARK_AS_PMAP_TEXT static boolean_t
923 arm_force_fast_fault_with_flush_range(
924 	ppnum_t ppnum,
925 	vm_prot_t allow_mode,
926 	int options,
927 	pmap_tlb_flush_range_t *flush_range);
928 
929 /**
930  * Definition of the states driving the batch cache attributes update
931  * state machine.
932  */
933 typedef struct {
934 	uint64_t page_index : 32,           /* The page index to be operated on */
935 	    state : 8,                      /* The current state of the update machine */
936 	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
937 	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
938 	:0;
939 } batch_set_cache_attr_state_t;
940 
941 /* Possible values of the "state" field. */
942 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
943 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
944 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
945 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
946 
947 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
948 
949 PMAP_SUPPORT_PROTOTYPES(
950 	batch_set_cache_attr_state_t,
951 	pmap_batch_set_cache_attributes, (
952 #if XNU_MONITOR
953 		volatile upl_page_info_t *user_page_list,
954 #else /* !XNU_MONITOR */
955 		upl_page_info_array_t user_page_list,
956 #endif /* XNU_MONITOR */
957 		batch_set_cache_attr_state_t state,
958 		unsigned int page_cnt,
959 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
960 
961 PMAP_SUPPORT_PROTOTYPES(
962 	kern_return_t,
963 	pmap_change_wiring, (pmap_t pmap,
964 	vm_map_address_t v,
965 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
966 
967 PMAP_SUPPORT_PROTOTYPES(
968 	pmap_t,
969 	pmap_create_options, (ledger_t ledger,
970 	vm_map_size_t size,
971 	unsigned int flags,
972 	kern_return_t * kr), PMAP_CREATE_INDEX);
973 
974 PMAP_SUPPORT_PROTOTYPES(
975 	void,
976 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
977 
978 PMAP_SUPPORT_PROTOTYPES(
979 	kern_return_t,
980 	pmap_enter_options, (pmap_t pmap,
981 	vm_map_address_t v,
982 	pmap_paddr_t pa,
983 	vm_prot_t prot,
984 	vm_prot_t fault_type,
985 	unsigned int flags,
986 	boolean_t wired,
987 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
988 
989 PMAP_SUPPORT_PROTOTYPES(
990 	pmap_paddr_t,
991 	pmap_find_pa, (pmap_t pmap,
992 	addr64_t va), PMAP_FIND_PA_INDEX);
993 
994 PMAP_SUPPORT_PROTOTYPES(
995 	kern_return_t,
996 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
997 
998 
999 PMAP_SUPPORT_PROTOTYPES(
1000 	boolean_t,
1001 	pmap_is_empty, (pmap_t pmap,
1002 	vm_map_offset_t va_start,
1003 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1004 
1005 
1006 PMAP_SUPPORT_PROTOTYPES(
1007 	unsigned int,
1008 	pmap_map_cpu_windows_copy, (ppnum_t pn,
1009 	vm_prot_t prot,
1010 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1011 
1012 PMAP_SUPPORT_PROTOTYPES(
1013 	void,
1014 	pmap_ro_zone_memcpy, (zone_id_t zid,
1015 	vm_offset_t va,
1016 	vm_offset_t offset,
1017 	const vm_offset_t new_data,
1018 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1019 
1020 PMAP_SUPPORT_PROTOTYPES(
1021 	uint64_t,
1022 	pmap_ro_zone_atomic_op, (zone_id_t zid,
1023 	vm_offset_t va,
1024 	vm_offset_t offset,
1025 	zro_atomic_op_t op,
1026 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1027 
1028 PMAP_SUPPORT_PROTOTYPES(
1029 	void,
1030 	pmap_ro_zone_bzero, (zone_id_t zid,
1031 	vm_offset_t va,
1032 	vm_offset_t offset,
1033 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1034 
1035 PMAP_SUPPORT_PROTOTYPES(
1036 	vm_map_offset_t,
1037 	pmap_nest, (pmap_t grand,
1038 	pmap_t subord,
1039 	addr64_t vstart,
1040 	uint64_t size,
1041 	vm_map_offset_t vrestart,
1042 	kern_return_t * krp), PMAP_NEST_INDEX);
1043 
1044 PMAP_SUPPORT_PROTOTYPES(
1045 	void,
1046 	pmap_page_protect_options, (ppnum_t ppnum,
1047 	vm_prot_t prot,
1048 	unsigned int options,
1049 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1050 
1051 PMAP_SUPPORT_PROTOTYPES(
1052 	vm_map_address_t,
1053 	pmap_protect_options, (pmap_t pmap,
1054 	vm_map_address_t start,
1055 	vm_map_address_t end,
1056 	vm_prot_t prot,
1057 	unsigned int options,
1058 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1059 
1060 PMAP_SUPPORT_PROTOTYPES(
1061 	kern_return_t,
1062 	pmap_query_page_info, (pmap_t pmap,
1063 	vm_map_offset_t va,
1064 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1065 
1066 PMAP_SUPPORT_PROTOTYPES(
1067 	mach_vm_size_t,
1068 	pmap_query_resident, (pmap_t pmap,
1069 	vm_map_address_t start,
1070 	vm_map_address_t end,
1071 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1072 
1073 PMAP_SUPPORT_PROTOTYPES(
1074 	void,
1075 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1076 
1077 PMAP_SUPPORT_PROTOTYPES(
1078 	vm_map_address_t,
1079 	pmap_remove_options, (pmap_t pmap,
1080 	vm_map_address_t start,
1081 	vm_map_address_t end,
1082 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1083 
1084 
1085 PMAP_SUPPORT_PROTOTYPES(
1086 	void,
1087 	pmap_set_cache_attributes, (ppnum_t pn,
1088 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1089 
1090 PMAP_SUPPORT_PROTOTYPES(
1091 	void,
1092 	pmap_update_compressor_page, (ppnum_t pn,
1093 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1094 
1095 PMAP_SUPPORT_PROTOTYPES(
1096 	void,
1097 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1098 
1099 #if MACH_ASSERT || XNU_MONITOR
1100 PMAP_SUPPORT_PROTOTYPES(
1101 	void,
1102 	pmap_set_process, (pmap_t pmap,
1103 	int pid,
1104 	char *procname), PMAP_SET_PROCESS_INDEX);
1105 #endif
1106 
1107 PMAP_SUPPORT_PROTOTYPES(
1108 	void,
1109 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1110 
1111 PMAP_SUPPORT_PROTOTYPES(
1112 	vm_map_offset_t,
1113 	pmap_unnest_options, (pmap_t grand,
1114 	addr64_t vaddr,
1115 	uint64_t size,
1116 	vm_map_offset_t vrestart,
1117 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1118 
1119 PMAP_SUPPORT_PROTOTYPES(
1120 	void,
1121 	phys_attribute_set, (ppnum_t pn,
1122 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1123 
1124 PMAP_SUPPORT_PROTOTYPES(
1125 	void,
1126 	phys_attribute_clear, (ppnum_t pn,
1127 	unsigned int bits,
1128 	int options,
1129 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1130 
1131 #if __ARM_RANGE_TLBI__
1132 PMAP_SUPPORT_PROTOTYPES(
1133 	vm_map_address_t,
1134 	phys_attribute_clear_range, (pmap_t pmap,
1135 	vm_map_address_t start,
1136 	vm_map_address_t end,
1137 	unsigned int bits,
1138 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1139 #endif /* __ARM_RANGE_TLBI__ */
1140 
1141 
1142 PMAP_SUPPORT_PROTOTYPES(
1143 	void,
1144 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1145 
1146 PMAP_SUPPORT_PROTOTYPES(
1147 	void,
1148 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1149 
1150 PMAP_SUPPORT_PROTOTYPES(
1151 	void,
1152 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1153 
1154 PMAP_SUPPORT_PROTOTYPES(
1155 	void,
1156 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1157 
1158 PMAP_SUPPORT_PROTOTYPES(
1159 	void,
1160 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1161 
1162 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1163 PMAP_SUPPORT_PROTOTYPES(
1164 	void,
1165 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1166 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1167 
/* Definition of the states used by pmap_trim(). */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentinel value for sanity checks; must remain the last enumerator. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1188 
1189 PMAP_SUPPORT_PROTOTYPES(
1190 	pmap_trim_state_t,
1191 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1192 
1193 #if HAS_APPLE_PAC
1194 PMAP_SUPPORT_PROTOTYPES(
1195 	void *,
1196 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1197 PMAP_SUPPORT_PROTOTYPES(
1198 	void *,
1199 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1200 #endif /* HAS_APPLE_PAC */
1201 
1202 
1203 
1204 
1205 PMAP_SUPPORT_PROTOTYPES(
1206 	kern_return_t,
1207 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1208 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1209 
1210 PMAP_SUPPORT_PROTOTYPES(
1211 	kern_return_t,
1212 	pmap_load_trust_cache_with_type, (TCType_t type,
1213 	const vm_address_t pmap_img4_payload,
1214 	const vm_size_t pmap_img4_payload_len,
1215 	const vm_address_t img4_manifest,
1216 	const vm_size_t img4_manifest_len,
1217 	const vm_address_t img4_aux_manifest,
1218 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1219 
1220 PMAP_SUPPORT_PROTOTYPES(
1221 	void,
1222 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1223 
1224 PMAP_SUPPORT_PROTOTYPES(
1225 	kern_return_t,
1226 	pmap_query_trust_cache, (TCQueryType_t query_type,
1227 	const uint8_t cdhash[kTCEntryHashSize],
1228 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1229 
1230 PMAP_SUPPORT_PROTOTYPES(
1231 	errno_t,
1232 	pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1233 	const void *input_data,
1234 	size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1235 
1236 #if PMAP_CS_INCLUDE_CODE_SIGNING
1237 
1238 PMAP_SUPPORT_PROTOTYPES(
1239 	kern_return_t,
1240 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1241 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1242 
1243 PMAP_SUPPORT_PROTOTYPES(
1244 	kern_return_t,
1245 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1246 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1247 
1248 PMAP_SUPPORT_PROTOTYPES(
1249 	kern_return_t,
1250 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1251 	pmap_cs_profile_t * profile_obj),
1252 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1253 
1254 PMAP_SUPPORT_PROTOTYPES(
1255 	kern_return_t,
1256 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1257 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1258 
1259 PMAP_SUPPORT_PROTOTYPES(
1260 	kern_return_t,
1261 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1262 	const void *kernel_entitlements),
1263 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1264 
1265 PMAP_SUPPORT_PROTOTYPES(
1266 	kern_return_t,
1267 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1268 	const void **kernel_entitlements),
1269 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1270 
1271 PMAP_SUPPORT_PROTOTYPES(
1272 	kern_return_t,
1273 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1274 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1275 
1276 PMAP_SUPPORT_PROTOTYPES(
1277 	kern_return_t,
1278 	pmap_cs_allow_invalid, (pmap_t pmap),
1279 	PMAP_CS_ALLOW_INVALID_INDEX);
1280 
1281 PMAP_SUPPORT_PROTOTYPES(
1282 	void,
1283 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1284 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1285 
1286 PMAP_SUPPORT_PROTOTYPES(
1287 	bool,
1288 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1289 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1290 
1291 PMAP_SUPPORT_PROTOTYPES(
1292 	void,
1293 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1294 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1295 
1296 PMAP_SUPPORT_PROTOTYPES(
1297 	void,
1298 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1299 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1300 
1301 #endif
1302 
1303 PMAP_SUPPORT_PROTOTYPES(
1304 	uint32_t,
1305 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1306 
1307 PMAP_SUPPORT_PROTOTYPES(
1308 	bool,
1309 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1310 
1311 PMAP_SUPPORT_PROTOTYPES(
1312 	void,
1313 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1314 
1315 void pmap_footprint_suspend(vm_map_t    map,
1316     boolean_t   suspend);
1317 PMAP_SUPPORT_PROTOTYPES(
1318 	void,
1319 	pmap_footprint_suspend, (vm_map_t map,
1320 	boolean_t suspend),
1321 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1322 
1323 
1324 
1325 
1326 #if DEVELOPMENT || DEBUG
1327 PMAP_SUPPORT_PROTOTYPES(
1328 	kern_return_t,
1329 	pmap_test_text_corruption, (pmap_paddr_t),
1330 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1331 #endif /* DEVELOPMENT || DEBUG */
1332 
1333 /*
1334  * The low global vector page is mapped at a fixed alias.
1335  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1336  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1337  * to check both addresses anyway for backward compatibility. So for now
1338  * we leave H6 and H7 where they were.
1339  */
1340 #if (ARM_PGSHIFT == 14)
1341 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1342 #else
1343 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1344 #endif
1345 
1346 
/* Allocation counters for page-table pages (root / twig / leaf), kept in PPL-protected data. */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1350 
#if XNU_MONITOR

#if __has_feature(ptrauth_calls)
/*
 * When pointer authentication is available, each stored handler pointer is
 * signed with address diversity (and a zero extra discriminator) so table
 * entries cannot be forged or swapped in place.
 */
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX constants passed in the PMAP_SUPPORT_PROTOTYPES
 * declarations above; each entry points at the corresponding *_internal
 * implementation.  The table itself is const (and its slots ptrauth-signed
 * where supported) so the dispatch targets are immutable after lockdown.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1444 
1445 #if XNU_MONITOR
1446 /**
1447  * A convenience function for setting protections on a single physical
1448  * aperture or static region mapping without invalidating the TLB.
1449  *
1450  * @note This function does not perform any TLB invalidations. That must be done
1451  *       separately to be able to safely use the updated mapping.
1452  *
1453  * @note This function understands the difference between the VM page size and
1454  *       the kernel page size and will update multiple PTEs if the sizes differ.
1455  *       In other words, enough PTEs will always get updated to change the
1456  *       permissions on a PAGE_SIZE amount of memory.
1457  *
1458  * @note The PVH lock for the physical page represented by this mapping must
1459  *       already be locked.
1460  *
1461  * @note This function assumes the caller has already verified that the PTE
1462  *       pointer does indeed point to a physical aperture or static region page
1463  *       table. Please validate your inputs before passing it along to this
1464  *       function.
1465  *
1466  * @param ptep Pointer to the physical aperture or static region page table to
1467  *             update with a new XPRR index.
1468  * @param expected_perm The XPRR index that is expected to already exist at the
1469  *                      current mapping. If the current index doesn't match this
1470  *                      then the system will panic.
1471  * @param new_perm The new XPRR index to update the mapping with.
1472  */
1473 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1474 pmap_set_pte_xprr_perm(
1475 	pt_entry_t * const ptep,
1476 	unsigned int expected_perm,
1477 	unsigned int new_perm)
1478 {
1479 	assert(ptep != NULL);
1480 
1481 	pt_entry_t spte = *ptep;
1482 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1483 
1484 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1485 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1486 		    __func__, ptep, new_perm, expected_perm);
1487 	}
1488 
1489 	/**
1490 	 * The PTE involved should be valid, should not have the hint bit set, and
1491 	 * should have the expected XPRR index.
1492 	 */
1493 	if (__improbable(!pte_is_valid(spte))) {
1494 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1495 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1496 		    __func__, ptep, spte, new_perm, expected_perm);
1497 	}
1498 
1499 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1500 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1501 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1502 		    __func__, ptep, spte, new_perm, expected_perm);
1503 	}
1504 
1505 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1506 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1507 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1508 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1509 	}
1510 
1511 	pt_entry_t template = spte;
1512 	template &= ~ARM_PTE_XPRR_MASK;
1513 	template |= xprr_perm_to_pte(new_perm);
1514 
1515 	write_pte_strong(ptep, template);
1516 }
1517 
1518 /**
1519  * Update the protections on a single physical aperture mapping and invalidate
1520  * the TLB so the mapping can be used.
1521  *
1522  * @note The PVH lock for the physical page must already be locked.
1523  *
1524  * @param pai The physical address index of the page whose physical aperture
1525  *            mapping will be updated with new permissions.
1526  * @param expected_perm The XPRR index that is expected to already exist at the
1527  *                      current mapping. If the current index doesn't match this
1528  *                      then the system will panic.
1529  * @param new_perm The new XPRR index to update the mapping with.
1530  */
1531 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1532 pmap_set_xprr_perm(
1533 	unsigned int pai,
1534 	unsigned int expected_perm,
1535 	unsigned int new_perm)
1536 {
1537 	pvh_assert_locked(pai);
1538 
1539 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1540 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1541 
1542 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1543 
1544 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1545 	sync_tlb_flush();
1546 }
1547 
1548 /**
1549  * Update the protections on a range of physical aperture or static region
1550  * mappings and invalidate the TLB so the mappings can be used.
1551  *
1552  * @note Static region mappings can only be updated before machine_lockdown().
1553  *       Physical aperture mappings can be updated at any time.
1554  *
1555  * @param start The starting virtual address of the static region or physical
1556  *              aperture range whose permissions will be updated.
1557  * @param end The final (inclusive) virtual address of the static region or
1558  *            physical aperture range whose permissions will be updated.
1559  * @param expected_perm The XPRR index that is expected to already exist at the
1560  *                      current mappings. If the current indices don't match
1561  *                      this then the system will panic.
1562  * @param new_perm The new XPRR index to update the mappings with.
1563  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the last twig so we never walk past the requested range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Block/invalid twig entries are fatal: only table entries may be walked. */
		if (!tte_is_valid_table(tte)) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the target page's PVH lock held. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Single flush for the whole updated range makes the new permissions live. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1659 
1660 #endif /* XNU_MONITOR */
1661 
1662 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1663 PMAP_ZINFO_PALLOC(
1664 	pmap_t pmap, int bytes)
1665 {
1666 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1667 }
1668 
1669 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1670 PMAP_ZINFO_PFREE(
1671 	pmap_t pmap,
1672 	int bytes)
1673 {
1674 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1675 }
1676 
1677 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1678 pmap_tt_ledger_credit(
1679 	pmap_t          pmap,
1680 	vm_size_t       size)
1681 {
1682 	if (pmap != kernel_pmap) {
1683 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1684 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1685 	}
1686 }
1687 
1688 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1689 pmap_tt_ledger_debit(
1690 	pmap_t          pmap,
1691 	vm_size_t       size)
1692 {
1693 	if (pmap != kernel_pmap) {
1694 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1695 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1696 	}
1697 }
1698 
/**
 * Mark a hardware ASID as recently used in the Pseudo-LRU tracking state.
 *
 * Clears the ASID's bit in its 64-ASID pLRU chunk; when the chunk's bitmap
 * word becomes zero (every ASID in the chunk used), the chunk is stamped
 * with a new generation number and its bitmap refilled, so alloc_asid()
 * rotates allocation pressure toward the least-recently-exhausted chunk.
 *
 * No-op on HAS_16BIT_ASID targets, which do not use the pLRU allocator.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit bitmap word tracks one chunk of 64 hardware ASIDs. */
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			/* Chunk exhausted: advance its generation and refill its bitmap. */
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/* The top bit of the final word corresponds to MAX_HW_ASIDS, which is not a usable ASID. */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1712 
/**
 * Allocate a virtual ASID for the given pmap and derive from it the
 * hardware ASID (pmap->hw_asid) and software ASID (pmap->sw_asid) pair.
 *
 * @param pmap The pmap to receive the new ASID.
 *
 * @return true on success; false if the virtual ASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Prefer the 64-ASID chunk with the oldest (lowest) generation stamp. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Walk the free-virtual-ASID bitmap, visiting only the words that
		 * correspond to the chosen hardware-ASID chunk, and take the lowest
		 * free virtual ASID whose hardware ASID is in that chunk.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* All virtual ASIDs are in use. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Claim the virtual ASID (a clear bit means "allocated"). */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into its hardware-ASID and software-ASID parts. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1797 
/**
 * Release the ASID held by the given pmap back to the allocator.
 *
 * Atomically clears pmap->hw_asid; a prior value of 0 means no ASID was
 * held and the call is a no-op.  Otherwise the virtual ASID is reconstructed
 * from the hardware/software ASID pair (inverting the transforms applied by
 * alloc_asid()) and marked free in the ASID bitmap, and — when the pLRU
 * allocator is active — in the pLRU bitmap as well.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	/* Undo the EL0/EL1 pairing shift applied at allocation time. */
	hw_asid >>= 1;
#endif
	/* Undo the "+1" that reserves ASID 0 for the kernel. */
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-VASID case set up by alloc_asid() for an unsupportable hw ASID. */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Make the hardware ASID selectable by the pLRU allocator again. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	/* A set bit in asid_bitmap means "free". */
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1830 
1831 
1832 boolean_t
pmap_valid_address(pmap_paddr_t addr)1833 pmap_valid_address(
1834 	pmap_paddr_t addr)
1835 {
1836 	return pa_valid(addr);
1837 }
1838 
1839 
1840 
1841 
1842 
1843 
1844 /*
1845  *      Map memory at initialization.  The physical addresses being
1846  *      mapped are not managed and are never unmapped.
1847  *
1848  *      For now, VM is already on, we only need to map the
1849  *      specified memory.
1850  */
1851 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1852 pmap_map(
1853 	vm_map_address_t virt,
1854 	vm_offset_t start,
1855 	vm_offset_t end,
1856 	vm_prot_t prot,
1857 	unsigned int flags)
1858 {
1859 	kern_return_t   kr;
1860 	vm_size_t       ps;
1861 
1862 	ps = PAGE_SIZE;
1863 	while (start < end) {
1864 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1865 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1866 
1867 		if (kr != KERN_SUCCESS) {
1868 			panic("%s: failed pmap_enter, "
1869 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1870 			    __FUNCTION__,
1871 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1872 		}
1873 
1874 		virt += ps;
1875 		start += ps;
1876 	}
1877 	return virt;
1878 }
1879 
1880 #if XNU_MONITOR
1881 /**
1882  * Remove kernel writeablity from an IO PTE value if the page is owned by
1883  * guarded mode software.
1884  *
1885  * @param paddr The physical address of the page which has to be non-DRAM.
1886  * @param tmplate The PTE value to be evaluated.
1887  *
1888  * @return A new PTE value with permission bits modified.
1889  */
static inline
pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
{
	/* Only non-managed (non-DRAM) physical addresses are expected here. */
	assert(!pa_valid(paddr));

	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));

	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
		/* PPL to own the page by converting KERN_RW to PPL_RW. */
		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
		switch (xprr_perm) {
		case XPRR_KERN_RO_PERM:
			/* Kernel read-only access is permitted on PPL-owned pages; leave as-is. */
			break;
		case XPRR_KERN_RW_PERM:
			/* Replace the kernel-writable xPRR index with the PPL-writable one. */
			tmplate &= ~ARM_PTE_XPRR_MASK;
			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
			break;
		default:
			/* Any other permission on a PPL-owned IO page is a programming error. */
			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
		}
	}

	return tmplate;
}
1915 #endif /* XNU_MONITOR */
1916 
/**
 * Map the physically-contiguous range [start, end) at virtual address "virt"
 * in the kernel pmap by writing leaf PTEs directly (no PV-list accounting),
 * using the memory attributes selected by "options".
 *
 * @param virt    Starting kernel VA; leaf page tables must already exist
 *                (panics otherwise).
 * @param start   Starting physical address of the range.
 * @param end     Ending physical address (exclusive).
 * @param prot    VM_PROT_WRITE selects a kernel-writable mapping; anything
 *                else yields kernel read-only.
 * @param options One of the PMAP_MAP_BD_* cacheability/posting modes.
 *
 * @return The virtual address just past the last mapping written.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	/* Translate the PMAP_MAP_BD_* option into memory-attribute PTE bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		/* Non-DRAM pages may be PPL-owned and need their permissions adjusted. */
		if (!pa_valid(paddr)) {
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template PTE. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale TLB entries for the VA range just (re)written. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1995 
1996 /*
1997  *      Back-door routine for mapping kernel VM at initialization.
1998  *      Useful for mapping memory outside the range
1999  *      [vm_first_phys, vm_last_phys] (i.e., devices).
2000  *      Otherwise like pmap_map.
2001  */
2002 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2003 pmap_map_bd(
2004 	vm_map_address_t virt,
2005 	vm_offset_t start,
2006 	vm_offset_t end,
2007 	vm_prot_t prot)
2008 {
2009 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
2010 }
2011 
2012 /*
2013  *      Back-door routine for mapping kernel VM at initialization.
2014  *      Useful for mapping memory specific physical addresses in early
2015  *      boot (i.e., before kernel_map is initialized).
2016  *
2017  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
2018  */
2019 
2020 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2021 pmap_map_high_window_bd(
2022 	vm_offset_t pa_start,
2023 	vm_size_t len,
2024 	vm_prot_t prot)
2025 {
2026 	pt_entry_t              *ptep, pte;
2027 	vm_map_address_t        va_start = VREGION1_START;
2028 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
2029 	vm_map_address_t        va_end;
2030 	vm_map_address_t        va;
2031 	vm_size_t               offset;
2032 
2033 	offset = pa_start & PAGE_MASK;
2034 	pa_start -= offset;
2035 	len += offset;
2036 
2037 	if (len > (va_max - va_start)) {
2038 		panic("%s: area too large, "
2039 		    "pa_start=%p, len=%p, prot=0x%x",
2040 		    __FUNCTION__,
2041 		    (void*)pa_start, (void*)len, prot);
2042 	}
2043 
2044 scan:
2045 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2046 		ptep = pmap_pte(kernel_pmap, va_start);
2047 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2048 		if (!pte_is_valid(*ptep)) {
2049 			break;
2050 		}
2051 	}
2052 	if (va_start > va_max) {
2053 		panic("%s: insufficient pages, "
2054 		    "pa_start=%p, len=%p, prot=0x%x",
2055 		    __FUNCTION__,
2056 		    (void*)pa_start, (void*)len, prot);
2057 	}
2058 
2059 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2060 		ptep = pmap_pte(kernel_pmap, va_end);
2061 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2062 		if (pte_is_valid(*ptep)) {
2063 			va_start = va_end + PAGE_SIZE;
2064 			goto scan;
2065 		}
2066 	}
2067 
2068 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2069 		ptep = pmap_pte(kernel_pmap, va);
2070 		pte = pa_to_pte(pa_start)
2071 		    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2072 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2073 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2074 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2075 #if __ARM_KERNEL_PROTECT__
2076 		pte |= ARM_PTE_NG;
2077 #endif /* __ARM_KERNEL_PROTECT__ */
2078 		write_pte_strong(ptep, pte);
2079 	}
2080 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2081 #if KASAN
2082 	kasan_notify_address(va_start, len);
2083 #endif
2084 	return va_start;
2085 }
2086 
2087 static uint32_t
pmap_compute_max_asids(void)2088 pmap_compute_max_asids(void)
2089 {
2090 	DTEntry entry;
2091 	void const *prop = NULL;
2092 	uint32_t max_asids;
2093 	int err;
2094 	unsigned int prop_size;
2095 
2096 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2097 	assert(err == kSuccess);
2098 
2099 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2100 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
2101 		 * we can choose a more flexible default value here. */
2102 		return MAX_ASIDS;
2103 	}
2104 
2105 	if (prop_size != sizeof(max_asids)) {
2106 		panic("pmap-max-asids property is not a 32-bit integer");
2107 	}
2108 
2109 	max_asids = *((uint32_t const *)prop);
2110 #if HAS_16BIT_ASID
2111 	if (max_asids > MAX_HW_ASIDS) {
2112 		panic("pmap-max-asids 0x%x too large", max_asids);
2113 	}
2114 #else
2115 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2116 	max_asids = (max_asids + 63) & ~63UL;
2117 
2118 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2119 		/* currently capped by size of pmap->sw_asid */
2120 		panic("pmap-max-asids 0x%x too large", max_asids);
2121 	}
2122 #endif /* HAS_16BIT_ASID */
2123 	if (max_asids == 0) {
2124 		panic("pmap-max-asids cannot be zero");
2125 	}
2126 	return max_asids;
2127 }
2128 
2129 #if __arm64__
2130 /*
2131  * pmap_get_arm64_prot
2132  *
2133  * return effective armv8 VMSA block protections including
2134  * table AP/PXN/XN overrides of a pmap entry
2135  *
2136  */
2137 
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk the table hierarchy from the root, accumulating table-level overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Address not mapped at this level: no effective protections. */
			return 0;
		}

		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_is_table(tte)) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the hierarchical AP/XN/PXN override fields from the table entries. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2182 #endif /* __arm64__ */
2183 
2184 /**
2185  * Helper macros for accessing the "unnested" and "in-progress" bits in
2186  * pmap->nested_region_unnested_table_bitmap.
2187  */
2188 #define UNNEST_BIT(index) ((index) * 2)
2189 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2190 
2191 
2192 /*
2193  *	Bootstrap the system enough to run with virtual memory.
2194  *
2195  *	The early VM initialization code has already allocated
2196  *	the first CPU's translation table and made entries for
2197  *	all the one-to-one mappings to be found there.
2198  *
2199  *	We must set up the kernel pmap structures, the
2200  *	physical-to-virtual translation lookup tables for the
2201  *	physical memory to be managed (between avail_start and
2202  *	avail_end).
2203  *
2204  *	Map the kernel's code and data, and allocate the system page table.
2205  *	Page_size must already be set.
2206  *
2207  *	Parameters:
2208  *	first_avail	first available physical page -
2209  *			   after kernel page tables
2210  *	avail_start	PA of first managed physical page
2211  *	avail_end	PA of last managed physical page
2212  */
2213 
2214 void
pmap_bootstrap(vm_offset_t vstart)2215 pmap_bootstrap(
2216 	vm_offset_t vstart)
2217 {
2218 	vm_map_offset_t maxoffset;
2219 
2220 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2221 
2222 #if XNU_MONITOR
2223 
2224 #if DEVELOPMENT || DEBUG
2225 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2226 #endif
2227 
2228 #if CONFIG_CSR_FROM_DT
2229 	if (csr_unsafe_kernel_text) {
2230 		pmap_ppl_disable = true;
2231 	}
2232 #endif /* CONFIG_CSR_FROM_DT */
2233 
2234 #endif /* XNU_MONITOR */
2235 
2236 #if DEVELOPMENT || DEBUG
2237 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2238 		kprintf("Kernel traces for pmap operations enabled\n");
2239 	}
2240 #endif
2241 
2242 	/*
2243 	 *	Initialize the kernel pmap.
2244 	 */
2245 #if ARM_PARAMETERIZED_PMAP
2246 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2247 #endif /* ARM_PARAMETERIZED_PMAP */
2248 #if HAS_APPLE_PAC
2249 	kernel_pmap->disable_jop = 0;
2250 #endif /* HAS_APPLE_PAC */
2251 	kernel_pmap->tte = cpu_tte;
2252 	kernel_pmap->ttep = cpu_ttep;
2253 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2254 	kernel_pmap->max = UINTPTR_MAX;
2255 	os_atomic_init(&kernel_pmap->ref_count, 1);
2256 #if XNU_MONITOR
2257 	os_atomic_init(&kernel_pmap->nested_count, 0);
2258 #endif
2259 	kernel_pmap->nx_enabled = TRUE;
2260 #ifdef  __arm64__
2261 	kernel_pmap->is_64bit = TRUE;
2262 #else
2263 	kernel_pmap->is_64bit = FALSE;
2264 #endif
2265 #if CONFIG_ROSETTA
2266 	kernel_pmap->is_rosetta = FALSE;
2267 #endif
2268 
2269 #if ARM_PARAMETERIZED_PMAP
2270 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2271 #endif /* ARM_PARAMETERIZED_PMAP */
2272 
2273 	kernel_pmap->nested_region_addr = 0x0ULL;
2274 	kernel_pmap->nested_region_size = 0x0ULL;
2275 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2276 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2277 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2278 
2279 	kernel_pmap->hw_asid = 0;
2280 	kernel_pmap->sw_asid = 0;
2281 
2282 	pmap_lock_init(kernel_pmap);
2283 
2284 	pmap_max_asids = pmap_compute_max_asids();
2285 #if HAS_16BIT_ASID
2286 	asid_chunk_size = MAX_HW_ASIDS;
2287 #else
2288 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2289 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2290 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2291 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2292 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2293 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2294 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2295 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2296 #endif /* HAS_16BIT_ASIDS */
2297 
2298 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2299 
2300 #if HAS_SPECRES_DEBUGGING
2301 	PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2302 
2303 	if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2304 		panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2305 	}
2306 #endif /* HAS_SPECRES_DEBUGGING */
2307 
2308 	/**
2309 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2310 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2311 	 * space for these data structures.
2312 	 */
2313 	pmap_data_bootstrap();
2314 
2315 	/**
2316 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2317 	 */
2318 	uat_bootstrap();
2319 
2320 
2321 	/**
2322 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2323 	 */
2324 	sart_bootstrap();
2325 
2326 	/**
2327 	 * Don't make any assumptions about the alignment of avail_start before this
2328 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2329 	 */
2330 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2331 
2332 	const pmap_paddr_t pmap_struct_start = avail_start;
2333 
2334 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2335 	avail_start = round_page(avail_start + asid_table_size);
2336 
2337 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2338 
2339 	vm_first_phys = gPhysBase;
2340 	vm_last_phys = trunc_page(avail_end);
2341 
2342 	queue_init(&map_pmap_list);
2343 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2344 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2345 	free_page_size_tt_count = 0;
2346 	free_page_size_tt_max = 0;
2347 	free_tt_list = TT_FREE_ENTRY_NULL;
2348 	free_tt_count = 0;
2349 	free_tt_max = 0;
2350 
2351 	virtual_space_start = vstart;
2352 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2353 
2354 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2355 #if !HAS_16BIT_ASID
2356 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2357 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2358 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2359 #endif /* !HAS_16BIT_ASID */
2360 
2361 
2362 
2363 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2364 		maxoffset = trunc_page(maxoffset);
2365 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2366 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2367 			arm_pmap_max_offset_default = maxoffset;
2368 		}
2369 	}
2370 #if defined(__arm64__)
2371 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2372 		maxoffset = trunc_page(maxoffset);
2373 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2374 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2375 			arm64_pmap_max_offset_default = maxoffset;
2376 		}
2377 	}
2378 #endif
2379 
2380 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2381 
2382 
2383 #if PMAP_CS_PPL_MONITOR
2384 	/* Initialize the PPL trust cache read-write lock */
2385 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2386 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2387 #endif
2388 
2389 #if DEVELOPMENT || DEBUG
2390 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2391 	    &vm_footprint_suspend_allowed,
2392 	    sizeof(vm_footprint_suspend_allowed));
2393 #endif /* DEVELOPMENT || DEBUG */
2394 
2395 #if KASAN
2396 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2397 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2398 #endif /* KASAN */
2399 
2400 	/**
2401 	 * Ensure that avail_start is always left on a page boundary. The calling
2402 	 * code might not perform any alignment before allocating page tables so
2403 	 * this is important.
2404 	 */
2405 	avail_start = round_page(avail_start);
2406 }
2407 
2408 #if XNU_MONITOR
2409 
2410 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2411 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2412 {
2413 	pmap_paddr_t cur_pa;
2414 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2415 		assert(pa_valid(cur_pa));
2416 		ppattr_pa_set_monitor(cur_pa);
2417 	}
2418 }
2419 
2420 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2421 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2422     pmap_paddr_t end_pa,
2423     unsigned int expected_perm,
2424     unsigned int new_perm)
2425 {
2426 	vm_offset_t start_va = phystokv(start_pa);
2427 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2428 
2429 	pa_set_range_monitor(start_pa, end_pa);
2430 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2431 }
2432 
/**
 * Tag every physical page backing the kernelcache with PVH_FLAG_LOCKDOWN_KC
 * so later attempts to remap it are prevented (see the call site in
 * pmap_static_allocations_done()).  Pages whose physical-aperture mapping is
 * non-linear w.r.t. the kernelcache are skipped, as they will be reclaimed.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() may shorten range_size to the contiguous run it translated. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		/* A page already locked down indicates conflicting ownership: fatal. */
		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain modifiable; clear their lockdown flag. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2478 
/**
 * Called once all static (boot-time) allocations are complete, to transfer
 * ownership of bootstrap memory and the PPL's own text/data regions to
 * guarded mode by retagging their physical pages and xPRR permissions, and
 * finally to lock down the kernelcache against remapping.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Already RO; retagging RO->RO here just marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2566 
2567 
2568 void
pmap_lockdown_ppl(void)2569 pmap_lockdown_ppl(void)
2570 {
2571 	/* Mark the PPL as being locked down. */
2572 
	/*
	 * Lock down the commpage pages so their mappings can no longer be
	 * changed.  Preemption is disabled around the _nopreempt locking
	 * operations performed by the lockdown helpers.
	 */
2573 	mp_disable_preemption(); // for _nopreempt locking operations
2574 	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	/* The text commpage is optional; only lock it down if it exists. */
2575 	if (commpage_text_kva != 0) {
2576 		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
2577 		    false, VM_PROT_READ | VM_PROT_EXECUTE);
2578 	}
2579 	mp_enable_preemption();
2580 
2581 	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error appears to be the fallback arm of a
	 * preprocessor conditional whose other branch was dropped by the
	 * cross-reference extraction — confirm against the full source
	 * before editing this function.
	 */
2582 #error "XPRR configuration error"
2583 }
2584 #endif /* XNU_MONITOR */
2585 
2586 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2587 pmap_virtual_space(
2588 	vm_offset_t *startp,
2589 	vm_offset_t *endp
2590 	)
2591 {
2592 	*startp = virtual_space_start;
2593 	*endp = virtual_space_end;
2594 }
2595 
2596 
/*
 * Enumerate the kernel VA regions available to the VM layer.  The VM calls
 * this repeatedly with increasing region_select values until it returns
 * FALSE; which regions exist depends on KTRR/CTRR and large-memory config.
 *
 * @param region_select index of the region being queried.
 * @param startp        filled with the region's base VA when TRUE is returned.
 * @param size          filled with the region's size when TRUE is returned.
 *
 * @return TRUE if region_select names a valid region in this configuration.
 */
2597 boolean_t
pmap_virtual_region(unsigned int region_select,vm_map_offset_t * startp,vm_map_size_t * size)2598 pmap_virtual_region(
2599 	unsigned int region_select,
2600 	vm_map_offset_t *startp,
2601 	vm_map_size_t *size
2602 	)
2603 {
2604 	boolean_t       ret = FALSE;
2605 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2606 	if (region_select == 0) {
2607 		/*
2608 		 * In this config, the bootstrap mappings should occupy their own L2
2609 		 * TTs, as they should be immutable after boot.  Having the associated
2610 		 * TTEs and PTEs in their own pages allows us to lock down those pages,
2611 		 * while allowing the rest of the kernel address range to be remapped.
2612 		 */
2613 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2614 #if defined(ARM_LARGE_MEMORY)
2615 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2616 #else
2617 		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2618 #endif
2619 		ret = TRUE;
2620 	}
2621 
2622 #if defined(ARM_LARGE_MEMORY)
2623 	if (region_select == 1) {
2624 		*startp = VREGION1_START;
2625 		*size = VREGION1_SIZE;
2626 		ret = TRUE;
2627 	}
2628 #endif
2629 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2630 #if defined(ARM_LARGE_MEMORY)
2631 	/* For large memory systems with no KTRR/CTRR such as virtual machines */
2632 	if (region_select == 0) {
2633 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2634 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2635 		ret = TRUE;
2636 	}
2637 
2638 	if (region_select == 1) {
2639 		*startp = VREGION1_START;
2640 		*size = VREGION1_SIZE;
2641 		ret = TRUE;
2642 	}
2643 #else /* !defined(ARM_LARGE_MEMORY) */
2644 	unsigned long low_global_vr_mask = 0;
2645 	vm_map_size_t low_global_vr_size = 0;
2646 
2647 	if (region_select == 0) {
2648 		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		/* The masks are 32MB (16K pages) and 8MB (4K pages) L2-block granules. */
2649 		if (!TEST_PAGE_SIZE_4K) {
2650 			*startp = gVirtBase & 0xFFFFFFFFFE000000;
2651 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2652 		} else {
2653 			*startp = gVirtBase & 0xFFFFFFFFFF800000;
2654 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2655 		}
2656 		ret = TRUE;
2657 	}
2658 	if (region_select == 1) {
2659 		*startp = VREGION1_START;
2660 		*size = VREGION1_SIZE;
2661 		ret = TRUE;
2662 	}
2663 	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2664 	if (!TEST_PAGE_SIZE_4K) {
2665 		low_global_vr_mask = 0xFFFFFFFFFE000000;
2666 		low_global_vr_size = 0x2000000;
2667 	} else {
2668 		low_global_vr_mask = 0xFFFFFFFFFF800000;
2669 		low_global_vr_size = 0x800000;
2670 	}
2671 
	/* Only offer the low-globals block if region 0 didn't already cover it. */
2672 	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2673 		*startp = LOW_GLOBAL_BASE_ADDRESS;
2674 		*size = low_global_vr_size;
2675 		ret = TRUE;
2676 	}
2677 
2678 	if (region_select == 3) {
2679 		/* In this config, we allow the bootstrap mappings to occupy the same
2680 		 * page table pages as the heap.
2681 		 */
2682 		*startp = VM_MIN_KERNEL_ADDRESS;
2683 		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2684 		ret = TRUE;
2685 	}
2686 #endif /* defined(ARM_LARGE_MEMORY) */
2687 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2688 	return ret;
2689 }
2690 
2691 /*
2692  * Routines to track and allocate physical pages during early boot.
2693  * On most systems that memory runs from first_avail through to avail_end
2694  * with no gaps.
2695  *
2696  * If the system supports ECC and ecc_bad_pages_count > 0, we
2697  * need to skip those pages.
2698  */
2699 
/* Pages remaining between first_avail and avail_end; maintained by pmap_next_page(). */
2700 static unsigned int avail_page_count = 0;
/* One-shot flag: initialize_ram_ranges() runs on the first allocator query. */
2701 static bool need_ram_ranges_init = true;
2702 
2703 
2704 /**
2705  * Checks to see if a given page is in
2706  * the array of known bad pages
2707  *
2708  * @param ppn page number to check
2709  */
2710 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2711 pmap_is_bad_ram(__unused ppnum_t ppn)
2712 {
2713 	return false;
2714 }
2715 
2716 /**
2717  * Prepare bad ram pages to be skipped.
2718  */
2719 
2720 /*
2721  * Initialize the count of available pages. No lock needed here,
2722  * as this code is called while kernel boot up is single threaded.
2723  */
2724 static void
initialize_ram_ranges(void)2725 initialize_ram_ranges(void)
2726 {
2727 	pmap_paddr_t first = first_avail;
2728 	pmap_paddr_t end = avail_end;
2729 
	/* Both bounds must be page-aligned and well-ordered. */
2730 	assert(first <= end);
2731 	assert(first == (first & ~PAGE_MASK));
2732 	assert(end == (end & ~PAGE_MASK));
	/* Number of whole pages in [first_avail, avail_end). */
2733 	avail_page_count = atop(end - first);
2734 
2735 	need_ram_ranges_init = false;
2736 }
2737 
2738 unsigned int
pmap_free_pages(void)2739 pmap_free_pages(
2740 	void)
2741 {
2742 	if (need_ram_ranges_init) {
2743 		initialize_ram_ranges();
2744 	}
2745 	return avail_page_count;
2746 }
2747 
2748 unsigned int
pmap_free_pages_span(void)2749 pmap_free_pages_span(
2750 	void)
2751 {
2752 	if (need_ram_ranges_init) {
2753 		initialize_ram_ranges();
2754 	}
2755 	return (unsigned int)atop(avail_end - first_avail);
2756 }
2757 
2758 
2759 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2760 pmap_next_page_hi(
2761 	ppnum_t            * pnum,
2762 	__unused boolean_t might_free)
2763 {
2764 	return pmap_next_page(pnum);
2765 }
2766 
2767 
2768 boolean_t
pmap_next_page(ppnum_t * pnum)2769 pmap_next_page(
2770 	ppnum_t *pnum)
2771 {
2772 	if (need_ram_ranges_init) {
2773 		initialize_ram_ranges();
2774 	}
2775 
2776 
2777 	if (first_avail != avail_end) {
2778 		*pnum = (ppnum_t)atop(first_avail);
2779 		first_avail += PAGE_SIZE;
2780 		assert(avail_page_count > 0);
2781 		--avail_page_count;
2782 		return TRUE;
2783 	}
2784 	assert(avail_page_count == 0);
2785 	return FALSE;
2786 }
2787 
2788 
2789 /**
2790  * Helper function to check whether the given physical
2791  * page number is a restricted page.
2792  *
2793  * @param pn the physical page number to query.
2794  */
2795 bool
pmap_is_page_restricted(__unused ppnum_t pn)2796 pmap_is_page_restricted(__unused ppnum_t pn)
2797 {
2798 	return false;
2799 }
2800 
2801 /*
2802  *	Initialize the pmap module.
2803  *	Called by vm_init, to initialize any structures that the pmap
2804  *	system needs to map virtual memory.
2805  */
/*
 * Initialize the pmap module proper: protect page zero, create the pmap
 * zone and the pmap VM object, and clamp the maxproc tunables to the
 * number of available ASIDs.  Called once by vm_init during boot.
 */
2806 void
pmap_init(void)2807 pmap_init(
2808 	void)
2809 {
2810 	/*
2811 	 *	Protect page zero in the kernel map.
2812 	 *	(can be overruled by permanent translation
2813 	 *	table entries at page zero - see arm_vm_init).
2814 	 */
2815 	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2816 
2817 	pmap_initialized = TRUE;
2818 
2819 	/*
2820 	 *	Create the zone of physical maps
2821 	 *	and the physical-to-virtual entries.
2822 	 */
2823 	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2824 	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2825 
2826 
2827 	/*
2828 	 *	Initialize the pmap object (for tracking the vm_page_t
2829 	 *	structures for pages we allocate to be page tables in
2830 	 *	pmap_expand()).
2831 	 */
2832 	_vm_object_allocate(mem_size, pmap_object);
2833 	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2834 
2835 	/*
2836 	 * The values of [hard_]maxproc may have been scaled, make sure
2837 	 * they are still less than the value of pmap_max_asids.
2838 	 */
2839 	if ((uint32_t)maxproc > pmap_max_asids) {
2840 		maxproc = pmap_max_asids;
2841 	}
2842 	if ((uint32_t)hard_maxproc > pmap_max_asids) {
2843 		hard_maxproc = pmap_max_asids;
2844 	}
2845 }
2846 
2847 /**
2848  * Verify that a given physical page contains no mappings (outside of the
2849  * default physical aperture mapping).
2850  *
2851  * @param ppnum Physical page number to check there are no mappings to.
2852  *
2853  * @return True if there are no mappings, false otherwise or if the page is not
2854  *         kernel-managed.
2855  */
2856 bool
pmap_verify_free(ppnum_t ppnum)2857 pmap_verify_free(ppnum_t ppnum)
2858 {
2859 	const pmap_paddr_t pa = ptoa(ppnum);
2860 
2861 	assert(pa != vm_page_fictitious_addr);
2862 
2863 	/* Only mappings to kernel-managed physical memory are tracked. */
2864 	if (!pa_valid(pa)) {
2865 		return false;
2866 	}
2867 
2868 	const unsigned int pai = pa_index(pa);
2869 	pv_entry_t **pvh = pai_to_pvh(pai);
2870 
2871 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2872 }
2873 
2874 #if MACH_ASSERT
2875 /**
2876  * Verify that a given physical page contains no mappings (outside of the
2877  * default physical aperture mapping) and if it does, then panic.
2878  *
2879  * @note It's recommended to use pmap_verify_free() directly when operating in
2880  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2881  *       normally being called from outside of the PPL, and the pv_head_table
2882  *       can't be modified outside of the PPL).
2883  *
2884  * @param ppnum Physical page number to check there are no mappings to.
2885  */
2886 void
pmap_assert_free(ppnum_t ppnum)2887 pmap_assert_free(ppnum_t ppnum)
2888 {
2889 	const pmap_paddr_t pa = ptoa(ppnum);
2890 
2891 	/* Only mappings to kernel-managed physical memory are tracked. */
2892 	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2893 		return;
2894 	}
2895 
	/* The page has at least one recorded mapping: gather panic diagnostics. */
2896 	const unsigned int pai = pa_index(pa);
2897 	pv_entry_t **pvh = pai_to_pvh(pai);
2898 
2899 	/**
2900 	 * This function is always called from outside of the PPL. Because of this,
2901 	 * the PVH entry can't be locked. This function is generally only called
2902 	 * before the VM reclaims a physical page and shouldn't be creating new
2903 	 * mappings. Even if a new mapping is created while parsing the hierarchy,
2904 	 * the worst case is that the system will panic in another way, and we were
2905 	 * already about to panic anyway.
2906 	 */
2907 
2908 	/**
2909 	 * Since pmap_verify_free() returned false, that means there is at least one
2910 	 * mapping left. Let's get some extra info on the first mapping we find to
2911 	 * dump in the panic string (the common case is that there is one spare
2912 	 * mapping that was never unmapped).
2913 	 */
2914 	pt_entry_t *first_ptep = PT_ENTRY_NULL;
2915 
2916 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head holds the PTE pointer directly. */
2917 		first_ptep = pvh_ptep(pvh);
2918 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2919 		pv_entry_t *pvep = pvh_pve_list(pvh);
2920 
2921 		/* Each PVE can contain multiple PTEs. Let's find the first one. */
2922 		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2923 			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2924 			if (first_ptep != PT_ENTRY_NULL) {
2925 				break;
2926 			}
2927 		}
2928 
2929 		/* The PVE should have at least one valid PTE. */
2930 		assert(first_ptep != PT_ENTRY_NULL);
2931 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2932 		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2933 		    __func__, pvh, pai);
2934 	} else {
2935 		/**
2936 		 * The mapping disappeared between here and the pmap_verify_free() call.
2937 		 * The only way that can happen is if the VM was racing this call with
2938 		 * a call that unmaps PTEs. Operations on this page should not be
2939 		 * occurring at the same time as this check, and unfortunately we can't
2940 		 * lock the PVH entry to prevent it, so just panic instead.
2941 		 */
2942 		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2943 		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2944 		    __func__, pvh, pai);
2945 	}
2946 
2947 	/* Panic with a unique string identifying the first bad mapping and owner. */
2948 	{
2949 		/* First PTE is mapped by the main CPUs. */
2950 		pmap_t pmap = ptep_get_pmap(first_ptep);
2951 		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2952 
2953 		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2954 		    "%s CPU mapping (pmap: %p)",
2955 		    __func__, (uint64_t)pa, first_ptep, type, pmap);
2956 	}
2957 }
2958 #endif
2959 
2960 
2961 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2962 pmap_root_alloc_size(pmap_t pmap)
2963 {
2964 #pragma unused(pmap)
2965 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2966 	unsigned int root_level = pt_attr_root_level(pt_attr);
2967 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2968 }
2969 
2970 
2971 /*
2972  *	Create and return a physical map.
2973  *
2974  *	If the size specified for the map
2975  *	is zero, the map is an actual physical
2976  *	map, and may be referenced by the
2977  *	hardware.
2978  *
2979  *	If the size specified is non-zero,
2980  *	the map will be used in software only, and
2981  *	is bounded by that size.
2982  */
/*
 * Internal implementation of pmap_create_options().
 *
 * @param ledger ledger to attach to the new pmap (already referenced by the
 *               caller; the XNU_MONITOR build additionally validates and
 *               retains it here).
 * @param size   must be 0; a non-zero size is only meaningful for stage-2
 *               pmaps, which this configuration does not support.
 * @param flags  PMAP_CREATE_* flags; unknown bits cause failure.
 * @param kr     out: failure reason (KERN_NO_SPACE when no ASID is free,
 *               KERN_RESOURCE_SHORTAGE when allocation fails).  Only written
 *               on the failure path; the success path leaves it untouched.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
2983 MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(ledger_t ledger,vm_map_size_t size,unsigned int flags,kern_return_t * kr)2984 pmap_create_options_internal(
2985 	ledger_t ledger,
2986 	vm_map_size_t size,
2987 	unsigned int flags,
2988 	kern_return_t *kr)
2989 {
2990 	unsigned        i;
2991 	unsigned        tte_index_max;
2992 	pmap_t          p;
2993 	bool is_64bit = flags & PMAP_CREATE_64BIT;
2994 #if defined(HAS_APPLE_PAC)
2995 	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2996 #endif /* defined(HAS_APPLE_PAC) */
2997 	kern_return_t   local_kr = KERN_SUCCESS;
2998 
2999 	if (size != 0) {
3000 		{
3001 			// Size parameter should only be set for stage 2.
3002 			return PMAP_NULL;
3003 		}
3004 	}
3005 
	/* Reject unknown flag bits outright. */
3006 	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
3007 		return PMAP_NULL;
3008 	}
3009 
3010 #if XNU_MONITOR
3011 	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
3012 		goto pmap_create_fail;
3013 	}
3014 
3015 	assert(p != PMAP_NULL);
3016 
3017 	if (ledger) {
3018 		pmap_ledger_validate(ledger);
3019 		pmap_ledger_retain(ledger);
3020 	}
3021 #else
3022 	/*
3023 	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
3024 	 *	the translation table of the right size for the pmap.
3025 	 */
3026 	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
3027 		local_kr = KERN_RESOURCE_SHORTAGE;
3028 		goto pmap_create_fail;
3029 	}
3030 #endif
3031 
3032 	p->ledger = ledger;
3033 
3034 
3035 	p->pmap_vm_map_cs_enforced = false;
3036 	p->min = 0;
3037 
3038 
3039 #if CONFIG_ROSETTA
3040 	if (flags & PMAP_CREATE_ROSETTA) {
3041 		p->is_rosetta = TRUE;
3042 	} else {
3043 		p->is_rosetta = FALSE;
3044 	}
3045 #endif /* CONFIG_ROSETTA */
3046 
3047 #if defined(HAS_APPLE_PAC)
3048 	p->disable_jop = disable_jop;
3049 #endif /* defined(HAS_APPLE_PAC) */
3050 
3051 	p->nested_region_true_start = 0;
3052 	p->nested_region_true_end = ~0;
3053 
3054 	p->nx_enabled = true;
3055 	p->is_64bit = is_64bit;
3056 	p->nested_pmap = PMAP_NULL;
3057 	p->type = PMAP_TYPE_USER;
3058 
3059 #if ARM_PARAMETERIZED_PMAP
3060 	/* Default to the native pt_attr */
3061 	p->pmap_pt_attr = native_pt_attr;
3062 #endif /* ARM_PARAMETERIZED_PMAP */
3063 #if __ARM_MIXED_PAGE_SIZE__
3064 	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
3065 		p->pmap_pt_attr = &pmap_pt_attr_4k;
3066 	}
3067 #endif /* __ARM_MIXED_PAGE_SIZE__ */
3068 	p->max = pmap_user_va_size(p);
3069 
	/* Claim a hardware ASID for this address space. */
3070 	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
3071 		local_kr = KERN_NO_SPACE;
3072 		goto id_alloc_fail;
3073 	}
3074 
3075 	pmap_lock_init(p);
3076 
3077 	p->tt_entry_free = (tt_entry_t *)0;
3078 	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
3079 
3080 
3081 #if XNU_MONITOR
3082 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
3083 #else
3084 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
3085 #endif
3086 	if (!(p->tte)) {
3087 		local_kr = KERN_RESOURCE_SHORTAGE;
3088 		goto tt1_alloc_fail;
3089 	}
3090 
3091 	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
3092 	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
3093 
3094 	/* nullify the translation table */
3095 	for (i = 0; i < tte_index_max; i++) {
3096 		p->tte[i] = ARM_TTE_TYPE_FAULT;
3097 	}
3098 
3099 	FLUSH_PTE();
3100 
3101 	/*
3102 	 *  initialize the rest of the structure
3103 	 */
3104 	p->nested_region_addr = 0x0ULL;
3105 	p->nested_region_size = 0x0ULL;
3106 	p->nested_region_unnested_table_bitmap = NULL;
3107 	p->nested_region_unnested_table_bitmap_size = 0x0UL;
3108 
3109 	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
3110 	p->nested_no_bounds_refcnt = 0;
3111 	p->nested_bounds_set = false;
3112 
3113 
3114 #if MACH_ASSERT
3115 	p->pmap_pid = 0;
3116 	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3117 #endif /* MACH_ASSERT */
3118 #if DEVELOPMENT || DEBUG
3119 	p->footprint_was_suspended = FALSE;
3120 #endif /* DEVELOPMENT || DEBUG */
3121 
3122 #if XNU_MONITOR
3123 	os_atomic_init(&p->nested_count, 0);
3124 	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3125 	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3126 	os_atomic_thread_fence(release);
3127 #endif
3128 	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global pmap list. */
3129 	pmap_simple_lock(&pmaps_lock);
3130 	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3131 	pmap_simple_unlock(&pmaps_lock);
3132 
3133 	/*
3134 	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
3135 	 * which can lead to a concurrent disconnect operation making the balance
3136 	 * transiently negative.  The ledger should still ultimately balance out,
3137 	 * which we still check upon pmap destruction.
3138 	 */
3139 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
3140 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
3141 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
3142 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
3143 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
3144 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
3145 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
3146 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
3147 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
3148 
3149 	return p;
3150 
	/* Failure paths: unwind in reverse order of acquisition. */
3151 tt1_alloc_fail:
3152 	pmap_get_pt_ops(p)->free_id(p);
3153 id_alloc_fail:
3154 #if XNU_MONITOR
3155 	pmap_free_pmap(p);
3156 
3157 	if (ledger) {
3158 		pmap_ledger_release(ledger);
3159 	}
3160 #else
3161 	zfree(pmap_zone, p);
3162 #endif
3163 pmap_create_fail:
3164 #if XNU_MONITOR
	/* kr points at kernel memory; pin it while the PPL writes through it. */
3165 	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3166 #endif
3167 	*kr = local_kr;
3168 #if XNU_MONITOR
3169 	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3170 #endif
3171 	return PMAP_NULL;
3172 }
3173 
3174 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3175 pmap_create_options(
3176 	ledger_t ledger,
3177 	vm_map_size_t size,
3178 	unsigned int flags)
3179 {
3180 	pmap_t pmap;
3181 	kern_return_t kr = KERN_SUCCESS;
3182 
3183 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3184 
3185 	ledger_reference(ledger);
3186 
3187 #if XNU_MONITOR
3188 	for (;;) {
3189 		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3190 		if (kr != KERN_RESOURCE_SHORTAGE) {
3191 			break;
3192 		}
3193 		assert(pmap == PMAP_NULL);
3194 		pmap_alloc_page_for_ppl(0);
3195 		kr = KERN_SUCCESS;
3196 	}
3197 #else
3198 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3199 #endif
3200 
3201 	if (pmap == PMAP_NULL) {
3202 		ledger_dereference(ledger);
3203 	}
3204 
3205 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3206 
3207 	return pmap;
3208 }
3209 
3210 #if XNU_MONITOR
3211 /*
3212  * This symbol remains in place when the PPL is enabled so that the dispatch
3213  * table does not change from development to release configurations.
3214  */
3215 #endif
3216 #if MACH_ASSERT || XNU_MONITOR
/*
 * Record the owning process's pid and name on a pmap for debugging.
 * Compiled to a no-op body unless MACH_ASSERT is enabled.
 */
3217 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3218 pmap_set_process_internal(
3219 	__unused pmap_t pmap,
3220 	__unused int pid,
3221 	__unused char *procname)
3222 {
3223 #if MACH_ASSERT
	/*
	 * NOTE(review): pmap_pid == -1 appears to act as a "do not retag"
	 * sentinel — confirm against the pmap_pid assignment sites.
	 */
3224 	if (pmap == NULL || pmap->pmap_pid == -1) {
3225 		return;
3226 	}
3227 
3228 	validate_pmap_mutable(pmap);
3229 
3230 	pmap->pmap_pid = pid;
3231 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3232 #endif /* MACH_ASSERT */
3233 }
3234 #endif /* MACH_ASSERT || XNU_MONITOR */
3235 
3236 #if MACH_ASSERT
/*
 * Kernel-side entry point for tagging a pmap with its owner's pid and
 * process name; dispatches to the PPL when XNU_MONITOR is enabled.
 */
3237 void
pmap_set_process(pmap_t pmap,int pid,char * procname)3238 pmap_set_process(
3239 	pmap_t pmap,
3240 	int pid,
3241 	char *procname)
3242 {
3243 #if XNU_MONITOR
3244 	pmap_set_process_ppl(pmap, pid, procname);
3245 #else
3246 	pmap_set_process_internal(pmap, pid, procname);
3247 #endif
3248 }
3249 #endif /* MACH_ASSERT */
3250 
3251 /*
3252  * pmap_deallocate_all_leaf_tts:
3253  *
3254  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3255  * removing and deallocating all TTEs.
3256  */
3257 MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap,tt_entry_t * first_ttep,unsigned level)3258 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3259 {
3260 	tt_entry_t tte = ARM_TTE_EMPTY;
3261 	tt_entry_t * ttep = NULL;
3262 	tt_entry_t * last_ttep = NULL;
3263 
3264 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3265 
	/* Leaf level holds PTEs, not TTEs; recursion must stop above it. */
3266 	assert(level < pt_attr_leaf_level(pt_attr));
3267 
	/* ttn_index(~0) yields the highest index at this level. */
3268 	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3269 
3270 	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3271 		tte = *ttep;
3272 
3273 		if (!(tte & ARM_TTE_VALID)) {
3274 			continue;
3275 		}
3276 
		/* Block mappings are never expected in a user pmap being torn down. */
3277 		if (tte_is_block(tte)) {
3278 			panic("%s: found block mapping, ttep=%p, tte=%p, "
3279 			    "pmap=%p, first_ttep=%p, level=%u",
3280 			    __FUNCTION__, ttep, (void *)tte,
3281 			    pmap, first_ttep, level);
3282 		}
3283 
3284 		/* Must be valid, type table */
3285 		if (level < pt_attr_twig_level(pt_attr)) {
3286 			/* If we haven't reached the twig level, recurse to the next level. */
3287 			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3288 		}
3289 
3290 		/* Remove the TTE. */
3291 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() here — presumably
		 * pmap_tte_deallocate() drops the exclusive lock on behalf of
		 * the caller; confirm before refactoring this loop.
		 */
3292 		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3293 	}
3294 }
3295 
3296 /*
3297  * We maintain stats and ledgers so that a task's physical footprint is:
3298  * phys_footprint = ((internal - alternate_accounting)
3299  *                   + (internal_compressed - alternate_accounting_compressed)
3300  *                   + iokit_mapped
3301  *                   + purgeable_nonvolatile
3302  *                   + purgeable_nonvolatile_compressed
3303  *                   + page_table)
3304  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3305  */
3306 
3307 /*
3308  *	Retire the given physical map from service.
3309  *	Should only be called if the map contains
3310  *	no valid mappings.
3311  */
/*
 * Internal implementation of pmap_destroy(): drop a reference and, when the
 * last reference goes away, tear down the translation tables, flush the TLB,
 * release the ASID, and free the pmap structure.  Must only be called on a
 * pmap that no longer contains valid user mappings.
 */
3312 MARK_AS_PMAP_TEXT void
pmap_destroy_internal(pmap_t pmap)3313 pmap_destroy_internal(
3314 	pmap_t pmap)
3315 {
3316 	if (pmap == PMAP_NULL) {
3317 		return;
3318 	}
3319 
3320 	validate_pmap(pmap);
3321 
3322 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3323 
3324 	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
3325 	if (ref_count > 0) {
		/* Other references remain; nothing more to do. */
3326 		return;
3327 	} else if (__improbable(ref_count < 0)) {
3328 		panic("pmap %p: refcount underflow", pmap);
3329 	} else if (__improbable(pmap == kernel_pmap)) {
3330 		panic("pmap %p: attempt to destroy kernel pmap", pmap);
3331 	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3332 		panic("pmap %p: attempt to destroy commpage pmap", pmap);
3333 	}
3334 
3335 	/*
3336 	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3337 	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3338 	 * That ensures that if the pmap is currently in use elsewhere, this path will
3339 	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3340 	 * ref_count of 0 and panic.
3341 	 */
3342 	os_atomic_thread_fence(seq_cst);
3343 
3344 #if XNU_MONITOR
3345 	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3346 		panic("pmap %p: attempt to destroy while nested", pmap);
3347 	}
	/* Make sure no CPU is currently running on, or switching to, this pmap. */
3348 	const int max_cpu = ml_get_max_cpu_number();
3349 	for (unsigned int i = 0; i <= max_cpu; ++i) {
3350 		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3351 		if (cpu_data == NULL) {
3352 			continue;
3353 		}
3354 		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3355 			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3356 		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3357 			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3358 		}
3359 	}
3360 #endif
3361 	pmap_unmap_commpage(pmap);
3362 
	/* Remove the pmap from the global pmap list. */
3363 	pmap_simple_lock(&pmaps_lock);
3364 	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3365 	pmap_simple_unlock(&pmaps_lock);
3366 
3367 	pmap_trim_self(pmap);
3368 
3369 	/*
3370 	 *	Free the memory maps, then the
3371 	 *	pmap structure.
3372 	 */
3373 	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3374 
3375 
3376 
3377 	if (pmap->tte) {
3378 		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3379 		pmap->tte = (tt_entry_t *) NULL;
3380 		pmap->ttep = 0;
3381 	}
3382 
3383 	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3384 
3385 	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps have no ASID of their own; flush by VA range instead. */
3386 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
3387 		sync_tlb_flush();
3388 	} else {
3389 		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3390 		sync_tlb_flush();
3391 		/* return its asid to the pool */
3392 		pmap_get_pt_ops(pmap)->free_id(pmap);
3393 		if (pmap->nested_pmap != NULL) {
3394 #if XNU_MONITOR
3395 			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3396 #endif
3397 			/* release the reference we hold on the nested pmap */
3398 			pmap_destroy_internal(pmap->nested_pmap);
3399 		}
3400 	}
3401 
3402 	pmap_check_ledgers(pmap);
3403 
3404 	if (pmap->nested_region_unnested_table_bitmap) {
3405 #if XNU_MONITOR
3406 		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
3407 #else
3408 		kfree_data(pmap->nested_region_unnested_table_bitmap,
3409 		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
3410 #endif
3411 	}
3412 
3413 #if XNU_MONITOR
3414 	if (pmap->ledger) {
3415 		pmap_ledger_release(pmap->ledger);
3416 	}
3417 
3418 	pmap_lock_destroy(pmap);
3419 	pmap_free_pmap(pmap);
3420 #else
3421 	pmap_lock_destroy(pmap);
3422 	zfree(pmap_zone, pmap);
3423 #endif
3424 }
3425 
/*
 * Drop a reference on a pmap, destroying it when the last reference goes
 * away.  The caller must pass a valid (non-NULL) pmap.
 */
3426 void
pmap_destroy(pmap_t pmap)3427 pmap_destroy(
3428 	pmap_t pmap)
3429 {
3430 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3431 
	/* Capture the ledger now: the pmap memory may be freed by the call below. */
3432 	ledger_t ledger = pmap->ledger;
3433 
3434 #if XNU_MONITOR
3435 	pmap_destroy_ppl(pmap);
3436 
3437 	pmap_ledger_check_balance(pmap);
3438 #else
3439 	pmap_destroy_internal(pmap);
3440 #endif
3441 
	/* Drop the reference taken in pmap_create_options(). */
3442 	ledger_dereference(ledger);
3443 
3444 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3445 }
3446 
3447 
3448 /*
3449  *	Add a reference to the specified pmap.
3450  */
3451 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3452 pmap_reference_internal(
3453 	pmap_t pmap)
3454 {
3455 	if (pmap != PMAP_NULL) {
3456 		validate_pmap_mutable(pmap);
3457 		os_atomic_inc(&pmap->ref_count, acquire);
3458 	}
3459 }
3460 
/*
 * Take an additional reference on the specified pmap; dispatches to the
 * PPL when XNU_MONITOR is enabled.
 */
3461 void
pmap_reference(pmap_t pmap)3462 pmap_reference(
3463 	pmap_t pmap)
3464 {
3465 #if XNU_MONITOR
3466 	pmap_reference_ppl(pmap);
3467 #else
3468 	pmap_reference_internal(pmap);
3469 #endif
3470 }
3471 
/*
 * Allocate a root (TT1) translation table for a pmap.
 *
 * Sub-page roots are carved out of a whole page and kept on free_tt_list;
 * whole-page roots come from free_page_size_tt_list or a fresh zeroed page.
 *
 * @param pmap   pmap whose ledger is credited for the allocation.
 * @param size   requested table size; rounded up to PAGE_SIZE unless it is
 *               exactly PMAP_ROOT_ALLOC_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail instead of waiting for pages.
 *
 * @return kernel VA of the new table, or NULL on resource shortage.
 */
3472 static tt_entry_t *
pmap_tt1_allocate(pmap_t pmap,vm_size_t size,unsigned option)3473 pmap_tt1_allocate(
3474 	pmap_t          pmap,
3475 	vm_size_t       size,
3476 	unsigned        option)
3477 {
3478 	tt_entry_t      *tt1 = NULL;
3479 	tt_free_entry_t *tt1_free;
3480 	pmap_paddr_t    pa;
3481 	vm_address_t    va;
3482 	vm_address_t    va_end;
3483 	kern_return_t   ret;
3484 
3485 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3486 		size = PAGE_SIZE;
3487 	}
3488 
3489 	/**
3490 	 * We expect top level translation tables to always fit into a single
3491 	 * physical page. This would also catch a misconfiguration if 4K
3492 	 * concatenated page tables needed more than one physical tt1 page.
3493 	 */
3494 	if (__improbable(size > PAGE_SIZE)) {
3495 		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
3496 	}
3497 
	/* Fast path: satisfy the request from one of the free lists. */
3498 	pmap_simple_lock(&tt1_lock);
3499 	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3500 		free_page_size_tt_count--;
3501 		tt1 = (tt_entry_t *)free_page_size_tt_list;
3502 		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3503 	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3504 		free_tt_count--;
3505 		tt1 = (tt_entry_t *)free_tt_list;
3506 		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3507 	}
3508 	pmap_simple_unlock(&tt1_lock);
3509 
3510 	if (tt1 != NULL) {
3511 		pmap_tt_ledger_credit(pmap, size);
3512 		return (tt_entry_t *)tt1;
3513 	}
3514 
	/* Slow path: allocate a fresh zeroed page. */
3515 	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3516 
3517 	if (ret == KERN_RESOURCE_SHORTAGE) {
3518 		return (tt_entry_t *)0;
3519 	}
3520 
3521 #if XNU_MONITOR
3522 	assert(pa);
3523 #endif
3524 
	/*
	 * For sub-page roots, the first carve-out of the page is returned to
	 * the caller and the remaining carve-outs are threaded onto the free
	 * list for future allocations.
	 */
3525 	if (size < PAGE_SIZE) {
3526 		va = phystokv(pa) + size;
3527 		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3528 		tt_free_entry_t *next_free = NULL;
3529 		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3530 			tt1_free = (tt_free_entry_t *)va;
3531 			tt1_free->next = next_free;
3532 			next_free = tt1_free;
3533 		}
3534 		pmap_simple_lock(&tt1_lock);
3535 		local_free_list->next = free_tt_list;
3536 		free_tt_list = next_free;
3537 		free_tt_count += ((PAGE_SIZE / size) - 1);
3538 		if (free_tt_count > free_tt_max) {
3539 			free_tt_max = free_tt_count;
3540 		}
3541 		pmap_simple_unlock(&tt1_lock);
3542 	}
3543 
3544 	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
3545 	 * Depending on the device, this can vary between 512b and 16K. */
3546 	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3547 	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3548 	pmap_tt_ledger_credit(pmap, size);
3549 
3550 	return (tt_entry_t *) phystokv(pa);
3551 }
3552 
/**
 * Return a top-level (root/TT1) translation table to the global free lists.
 *
 * The table is pushed onto free_tt_list (sub-page roots) or
 * free_page_size_tt_list (page-sized roots). Unless
 * PMAP_TT_DEALLOCATE_NOBLOCK is passed, the page-sized free list is then
 * trimmed back down to FREE_PAGE_SIZE_TT_MAX entries by freeing whole pages,
 * which may block.
 *
 * @param pmap The pmap the table belonged to (used for ledger accounting).
 * @param tt KVA of the table being freed.
 * @param size Size in bytes; sub-page sizes other than PMAP_ROOT_ALLOC_SIZE
 *             are treated as PAGE_SIZE, matching pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the potentially-blocking
 *               free-list trim.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size normalization done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	/* Callers that can't block stop here; the list is trimmed on a later call. */
	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list back to its cap. The lock is dropped
	 * around each pmap_pages_free() call (which may block), so the count is
	 * re-checked under the lock on every iteration.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3611 
/**
 * Allocate a non-root page table (and its page table descriptor) for a pmap.
 *
 * First tries the pmap's per-pmap free list (pmap->tt_entry_free); on a miss,
 * allocates a fresh zeroed page plus a PTD, updates the global accounting
 * counters and ledgers, and — when the pmap's page size is smaller than the
 * kernel PAGE_SIZE — donates the unused sub-page tables back to the pmap's
 * free list.
 *
 * @param pmap The pmap the table is allocated for.
 * @param ttp Output: set to the KVA of the new table on success, NULL first.
 * @param level Page table level being allocated; levels below the leaf level
 *              are counted as TTE pages, the leaf level as PTE pages.
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to fail
 *                instead of blocking.
 *
 * @return KERN_SUCCESS, KERN_RESOURCE_SHORTAGE when a NOWAIT allocation
 *         fails, or KERN_ABORTED if a preemptible lock acquisition aborts.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 *  Allocate a VM page for the level x page table entries.
		 *
		 * NOTE(review): alloc_flags is derived from PMAP_TT_ALLOCATE_NOWAIT but
		 * the retry checks below test PMAP_OPTIONS_NOWAIT — confirm callers pass
		 * both flags (or that the two flags share a value) when non-blocking
		 * behavior is required.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Non-leaf levels count as TTE pages, the leaf level as PTE pages. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Publish the PTD through the page's PV head entry. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Push the unused sub-page tables onto the pmap's free list. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3712 
3713 
/**
 * Return a non-root page table to the pmap's free list, and free the backing
 * physical page once every sub-page table it contains is free.
 *
 * The table must have a zero refcount (non-leaf tables carrying the
 * PT_DESC_REFCOUNT sentinel are reset to zero first); a non-zero refcount
 * panics. When PAGE_SIZE equals the pmap page size there is exactly one
 * table per page, so the page is freed immediately; otherwise the page is
 * only freed once all of its sibling tables are also on the free list.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being freed.
 * @param level Page table level of the table (leaf vs. non-leaf selects
 *              which in-use counter is decremented).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized page tables that fit in one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Poison the descriptor's VA slot for this table. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables use the PT_DESC_REFCOUNT sentinel rather than a live count. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every sub-page table sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Count how many of this page's sibling tables are already on the
		 * pmap's free list (ttp itself counts as the first).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every table in the page is now free: unlink all of the page's
			 * entries from the free list so the page itself can be released.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Siblings still in use elsewhere: just park this table on the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the PTD, clear the PV head, and free the physical page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3802 
3803 /**
3804  * Safely clear out a translation table entry.
3805  *
3806  * @note If the TTE to clear out points to a leaf table, then that leaf table
3807  *       must have a refcnt of zero before the TTE can be removed.
3808  * @note This function expects to be called with pmap locked exclusive, and will
3809  *       return with pmap unlocked.
3810  *
3811  * @param pmap The pmap containing the page table whose TTE is being removed.
3812  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3813  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3814  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3815  * @param ttep Pointer to the TTE that should be cleared out.
3816  * @param level The level of the page table that contains the TTE to be removed.
3817  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and push the write out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	/* The TTE is cleared; the consistency checks below run without the lock. */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per possible CPU; a set bit means that CPU is still unconfirmed. */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Park in WFE until the exclusive monitor is cleared, then re-scan. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* This CPU confirmed idle; don't poll it again. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3992 
3993 /**
3994  * Given a pointer to an entry within a `level` page table, delete the
3995  * page table at `level` + 1 that is represented by that entry. For instance,
3996  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3997  * contains the PA of the L3 table, and `level` would be "2".
3998  *
3999  * @note If the table getting deallocated is a leaf table, then that leaf table
4000  *       must have a refcnt of zero before getting deallocated. All other levels
4001  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
4002  * @note This function expects to be called with pmap locked exclusive and will
4003  *       return with pmap unlocked.
4004  *
4005  * @param pmap The pmap that owns the page table to be deallocated.
4006  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
4007  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
4008  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
4009  * @param ttep Pointer to the `level` TTE to remove.
4010  * @param level The level of the table that contains an entry pointing to the
4011  *              table to be removed. The deallocated page table will be a
4012  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
4013  *              deleted).
4014  */
4015 void
4016 pmap_tte_deallocate(
4017 	pmap_t pmap,
4018 	vm_offset_t va_start,
4019 	vm_offset_t va_end,
4020 	bool need_strong_sync,
4021 	tt_entry_t *ttep,
4022 	unsigned int level)
4023 {
4024 	tt_entry_t tte;
4025 
4026 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4027 
4028 	tte = *ttep;
4029 
4030 	if (tte_get_ptd(tte)->pmap != pmap) {
4031 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4032 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4033 	}
4034 
4035 	assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
4036 	    (unsigned long long)tte);
4037 
4038 	/* pmap_tte_remove() will drop the pmap lock */
4039 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4040 
4041 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4042 }
4043 
4044 /*
4045  *	Remove a range of hardware page-table entries.
4046  *	The entries given are the first (inclusive)
4047  *	and last (exclusive) entries for the VM pages.
4048  *	The virtual address is the va for the first pte.
4049  *
4050  *	The pmap must be locked.
4051  *	If the pmap is not the kernel pmap, the range must lie
4052  *	entirely within one pte-page.  This is NOT checked.
4053  *	Assumes that the pte-page exists.
4054  *
4055  *	Returns the number of PTE changed
4056  */
4057 MARK_AS_PMAP_TEXT static int
4058 pmap_remove_range(
4059 	pmap_t pmap,
4060 	vm_map_address_t va,
4061 	pt_entry_t *bpte,
4062 	pt_entry_t *epte)
4063 {
4064 	bool need_strong_sync = false;
4065 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4066 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4067 	if (num_changed > 0) {
4068 		PMAP_UPDATE_TLBS(pmap, va,
4069 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4070 	}
4071 	return num_changed;
4072 }
4073 
4074 
4075 #ifdef PVH_FLAG_EXEC
4076 
4077 /*
4078  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
4080  *	has no writable mappings anywhere in the system, including the physical
4081  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4082  *	synchronization overhead in cases where the call to this function is
4083  *	guaranteed to be followed by other TLB operations.
4084  */
4085 void
4086 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4087 {
4088 #if __ARM_PTE_PHYSMAP__
4089 	pvh_assert_locked(pai);
4090 	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4091 	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4092 
4093 	pt_entry_t tmplate = *pte_p;
4094 	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4095 		return;
4096 	}
4097 	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4098 	if (tmplate & ARM_PTE_HINT_MASK) {
4099 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4100 		    __func__, pte_p, (void *)kva, tmplate);
4101 	}
4102 	write_pte_strong(pte_p, tmplate);
4103 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
4104 	if (!flush_tlb_async) {
4105 		sync_tlb_flush();
4106 	}
4107 #endif
4108 }
4109 #endif /* defined(PVH_FLAG_EXEC) */
4110 
4111 
4112 
/**
 * Remove the PTEs in [bpte, epte) from a single page table, updating PV
 * lists, refcounts, and ledgers. TLB maintenance is NOT performed here beyond
 * a final FLUSH_PTE_STRONG(); the caller flushes the TLB.
 *
 * @param pmap The pmap the mappings belong to (locked exclusive by caller).
 * @param va VA mapped by the first PTE (bpte).
 * @param bpte First PTE to remove (inclusive); range must not cross a page
 *             table boundary.
 * @param epte Last PTE to remove (exclusive).
 * @param eva If non-NULL, enables early exit on pending preemption; set to
 *            the first VA NOT processed.
 * @param need_strong_sync Set to true if any removed PTE requires a strong
 *                         DSB for TLB synchronization (HAS_FEAT_XS only).
 * @param options PMAP_OPTIONS_REMOVE enables handling of compressed markers.
 *
 * @return The number of PTEs actually changed (cleared).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	/* Net adjustment to the page table's refcnt; applied once at the end. */
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The range must stay within one page table; refcnt accounting assumes it. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we have a stable PTE/PVH-lock pairing for a managed page,
		 * or determine that the page isn't managed.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_EMPTY);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock; retry if it changed pages. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_EMPTY) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf(pte_is_valid(*cpte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_EMPTY);
			num_pte_changed++;
		}

		/* User mappings hold a page table refcnt; queue the decrement. */
		if ((spte != ARM_PTE_EMPTY) && (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf(pte_is_valid(spte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcnt delta; underflow means a refcount bug. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4360 
4361 
4362 /*
4363  *	Remove the given range of addresses
4364  *	from the specified map.
4365  *
4366  *	It is assumed that the start and end are properly
4367  *	rounded to the hardware page size.
4368  */
4369 void
4370 pmap_remove(
4371 	pmap_t pmap,
4372 	vm_map_address_t start,
4373 	vm_map_address_t end)
4374 {
4375 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4376 }
4377 
/*
 *	Remove the mappings in [start, end) from the given pmap.
 *
 *	Only the single twig-level TTE covering 'start' is consulted, so the
 *	caller (pmap_remove_options) is responsible for splitting requests so
 *	they do not cross a twig (L2) boundary.  'start' and 'end' must be
 *	aligned to the pmap's leaf page size; otherwise we panic.
 *
 *	Returns the VA at which removal actually stopped, which may be less
 *	than 'end' if pmap_remove_range_options() terminated the walk early.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	/* Cleared if pmap_tte_deallocate() drops the pmap lock on our behalf. */
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject inverted or non-page-aligned ranges outright. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No translation table entry covers 'start': nothing mapped here. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if (tte_is_valid_table(*tte_p)) {
		/* Compute the bounding leaf PTE pointers for [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/*
		 * If this was a user pmap and the page table no longer has any
		 * active references, reclaim the now-empty leaf table.
		 */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* Flush TLB entries for the removed range unless already handled above. */
	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4444 
/*
 *	Public entry point for removing [start, end) from a pmap.
 *
 *	The range is processed in chunks that never cross a twig (L2)
 *	boundary, since pmap_remove_options_internal() only consults the
 *	single TTE covering the chunk's start address.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		/* Use a smaller, fixed batch size to bound time spent per call. */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance to the next twig (L2) boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL builds trap into the monitor for the actual removal. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4511 
4512 
4513 /*
4514  *	Remove phys addr if mapped in specified map
4515  */
4516 void
4517 pmap_remove_some_phys(
4518 	__unused pmap_t map,
4519 	__unused ppnum_t pn)
4520 {
4521 	/* Implement to support working set code */
4522 }
4523 
4524 /*
4525  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4526  * switch a thread onto a new vm_map.
4527  */
4528 void
4529 pmap_switch_user(thread_t thread, vm_map_t new_map)
4530 {
4531 	pmap_t new_pmap = new_map->pmap;
4532 
4533 
4534 	thread->map = new_map;
4535 	pmap_set_pmap(new_pmap, thread);
4536 
4537 }
4538 
4539 void
4540 pmap_set_pmap(
4541 	pmap_t pmap,
4542 	thread_t        thread)
4543 {
4544 	pmap_switch(pmap, thread);
4545 #if __ARM_USER_PROTECT__
4546 	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
4547 	thread->machine.asid = pmap->hw_asid;
4548 #endif
4549 }
4550 
4551 static void
4552 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4553 {
4554 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4555 }
4556 
#if HAS_SPECRES
/*
 * Issue an asynchronous CFP RCTX (speculation restriction) operation
 * targeting EL0 for this pmap's hardware ASID.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(operand));
}

#if REQUIRES_DVP_RCTX
/* Same as above, but for the DVP RCTX operation. */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4574 
4575 static inline bool
4576 pmap_user_ttb_is_clear(void)
4577 {
4578 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4579 }
4580 
/*
 *	Activate 'pmap' on the current CPU.
 *
 *	Decides which local TLB/speculation maintenance is required for the
 *	switch (ASID flush, shared-region flush, commpage flush, CFP/DVP
 *	restriction), performs any required break-before-make clearing of
 *	the user TTB, issues the maintenance, and finally installs the new
 *	user TTB via pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* ASID 0 is only valid for the kernel pmap. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* With kernel-protect, the low ASID bit is reserved; drop it. */
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* A different shared region than last time means its global TLB entries must go. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides: force the speculation-restriction decision either way. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page size geometry) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Clear the user TTB before maintenance if break-before-make is required. */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform a CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* Flush the outgoing commpage's global TLB entries by VA range. */
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize all the async maintenance issued above before installing the TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4758 
4759 void
4760 pmap_switch(
4761 	pmap_t pmap,
4762 	thread_t thread __unused)
4763 {
4764 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4765 #if XNU_MONITOR
4766 	pmap_switch_ppl(pmap);
4767 #else
4768 	pmap_switch_internal(pmap);
4769 #endif
4770 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4771 }
4772 
4773 void
4774 pmap_page_protect(
4775 	ppnum_t ppnum,
4776 	vm_prot_t prot)
4777 {
4778 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4779 }
4780 
4781 /*
4782  *	Routine:	pmap_page_protect_options
4783  *
4784  *	Function:
4785  *		Lower the permission for all mappings to a given
4786  *		page.
4787  */
4788 MARK_AS_PMAP_TEXT static void
4789 pmap_page_protect_options_with_flush_range(
4790 	ppnum_t ppnum,
4791 	vm_prot_t prot,
4792 	unsigned int options,
4793 	pmap_tlb_flush_range_t *flush_range)
4794 {
4795 	pmap_paddr_t    phys = ptoa(ppnum);
4796 	pv_entry_t    **pv_h;
4797 	pv_entry_t     *pve_p, *orig_pve_p;
4798 	pv_entry_t     *pveh_p;
4799 	pv_entry_t     *pvet_p;
4800 	pt_entry_t     *pte_p, *orig_pte_p;
4801 	pv_entry_t     *new_pve_p;
4802 	pt_entry_t     *new_pte_p;
4803 	vm_offset_t     pvh_flags;
4804 	unsigned int    pai;
4805 	bool            remove;
4806 	bool            set_NX;
4807 	unsigned int    pvh_cnt = 0;
4808 	unsigned int    pass1_updated = 0;
4809 	unsigned int    pass2_updated = 0;
4810 
4811 	assert(ppnum != vm_page_fictitious_addr);
4812 
4813 	/* Only work with managed pages. */
4814 	if (!pa_valid(phys)) {
4815 		return;
4816 	}
4817 
4818 	/*
4819 	 * Determine the new protection.
4820 	 */
4821 	switch (prot) {
4822 	case VM_PROT_ALL:
4823 		return;         /* nothing to do */
4824 	case VM_PROT_READ:
4825 	case VM_PROT_READ | VM_PROT_EXECUTE:
4826 		remove = false;
4827 		break;
4828 	default:
4829 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4830 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4831 		remove = true;
4832 		break;
4833 	}
4834 
4835 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4836 	if (remove) {
4837 #if !XNU_MONITOR
4838 		mp_disable_preemption();
4839 #endif
4840 		pmap_cpu_data = pmap_get_cpu_data();
4841 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4842 		/*
4843 		 * Ensure the store to inflight_disconnect will be observed before any of the
4844 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4845 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4846 		 * another CPU, in between this function's clearing a PTE and dropping the
4847 		 * corresponding pagetable refcount.  That can lead to a panic if the
4848 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4849 		 * store barrier; a store-release operation would not be sufficient.
4850 		 */
4851 		os_atomic_thread_fence(release);
4852 	}
4853 
4854 	pai = pa_index(phys);
4855 	pvh_lock(pai);
4856 	pv_h = pai_to_pvh(pai);
4857 	pvh_flags = pvh_get_flags(pv_h);
4858 
4859 #if XNU_MONITOR
4860 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4861 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4862 	}
4863 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4864 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4865 	}
4866 	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4867 		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4868 	}
4869 #endif
4870 
4871 
4872 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4873 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4874 	pveh_p = PV_ENTRY_NULL;
4875 	pvet_p = PV_ENTRY_NULL;
4876 	new_pve_p = PV_ENTRY_NULL;
4877 	new_pte_p = PT_ENTRY_NULL;
4878 
4879 
4880 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4881 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4882 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4883 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4884 		pveh_p = pve_p;
4885 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4886 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4887 	}
4888 
4889 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4890 	int pve_ptep_idx = 0;
4891 
4892 	/*
4893 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4894 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4895 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4896 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4897 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4898 	 * tlb_flush_needed to be true while issue_tlbi is false.
4899 	 */
4900 	bool issue_tlbi = false;
4901 	bool tlb_flush_needed = false;
4902 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4903 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4904 		pt_entry_t tmplate = ARM_PTE_EMPTY;
4905 		bool update = false;
4906 
4907 		if (pve_p != PV_ENTRY_NULL) {
4908 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4909 			if (pte_p == PT_ENTRY_NULL) {
4910 				goto protect_skip_pve_pass1;
4911 			}
4912 		}
4913 
4914 #ifdef PVH_FLAG_IOMMU
4915 		if (pvh_ptep_is_iommu(pte_p)) {
4916 #if XNU_MONITOR
4917 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4918 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4919 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4920 			}
4921 #endif
4922 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4923 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4924 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4925 			}
4926 			goto protect_skip_pve_pass1;
4927 		}
4928 #endif
4929 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4930 		const pmap_t pmap = ptdp->pmap;
4931 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4932 
4933 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4934 #if MACH_ASSERT
4935 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4936 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4937 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4938 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4939 
4940 				pv_entry_t *check_pvep = pve_p;
4941 
4942 				do {
4943 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4944 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4945 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4946 					}
4947 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4948 
4949 				/* Restore previous PTEP value. */
4950 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4951 			}
4952 #endif
4953 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4954 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4955 		}
4956 
4957 #if DEVELOPMENT || DEBUG
4958 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4959 #else
4960 		if ((prot & VM_PROT_EXECUTE))
4961 #endif
4962 		{
4963 			set_NX = false;
4964 		} else {
4965 			set_NX = true;
4966 		}
4967 
4968 #if HAS_FEAT_XS
4969 		/**
4970 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4971 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4972 		 */
4973 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4974 #endif /* HAS_FEAT_XS */
4975 
4976 		/* Remove the mapping if new protection is NONE */
4977 		if (remove) {
4978 			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4979 				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
4980 				    __func__, pmap, ppnum);
4981 			}
4982 
4983 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4984 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4985 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4986 			pt_entry_t spte = *pte_p;
4987 
4988 			if (pte_is_wired(spte)) {
4989 				pte_set_wired(pmap, pte_p, 0);
4990 				spte = *pte_p;
4991 				if (pmap != kernel_pmap) {
4992 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4993 				}
4994 			}
4995 
4996 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4997 			    (uint64_t)spte, pte_p, ppnum);
4998 
4999 			if (compress && is_internal && (pmap != kernel_pmap)) {
5000 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
5001 				/* mark this PTE as having been "compressed" */
5002 				tmplate = ARM_PTE_COMPRESSED;
5003 				if (is_altacct) {
5004 					tmplate |= ARM_PTE_COMPRESSED_ALT;
5005 				}
5006 			} else {
5007 				tmplate = ARM_PTE_EMPTY;
5008 			}
5009 
5010 			assert(spte != tmplate);
5011 			write_pte_fast(pte_p, tmplate);
5012 			update = true;
5013 			++pass1_updated;
5014 
5015 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5016 
5017 			if (pmap != kernel_pmap) {
5018 				if (ppattr_test_reusable(pai) &&
5019 				    is_internal &&
5020 				    !is_altacct) {
5021 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5022 				} else if (!is_internal) {
5023 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5024 				}
5025 
5026 				if (is_altacct) {
5027 					assert(is_internal);
5028 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5029 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5030 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5031 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5032 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5033 					}
5034 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5035 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5036 				} else if (ppattr_test_reusable(pai)) {
5037 					assert(is_internal);
5038 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5039 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5040 						/* was not in footprint, but is now */
5041 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5042 					}
5043 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5044 				} else if (is_internal) {
5045 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5046 
5047 					/*
5048 					 * Update all stats related to physical footprint, which only
5049 					 * deals with internal pages.
5050 					 */
5051 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5052 						/*
5053 						 * This removal is only being done so we can send this page to
5054 						 * the compressor; therefore it mustn't affect total task footprint.
5055 						 */
5056 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5057 					} else {
5058 						/*
5059 						 * This internal page isn't going to the compressor, so adjust stats to keep
5060 						 * phys_footprint up to date.
5061 						 */
5062 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5063 					}
5064 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5065 				} else {
5066 					/* external page: no impact on ledgers */
5067 				}
5068 			}
5069 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5070 		} else {
5071 			pt_entry_t spte = *pte_p;
5072 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5073 
5074 			if (pmap == kernel_pmap) {
5075 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5076 			} else {
5077 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5078 			}
5079 
5080 			/*
5081 			 * While the naive implementation of this would serve to add execute
5082 			 * permission, this is not how the VM uses this interface, or how
5083 			 * x86_64 implements it.  So ignore requests to add execute permissions.
5084 			 */
5085 			if (set_NX) {
5086 				tmplate |= pt_attr_leaf_xn(pt_attr);
5087 			}
5088 
5089 
5090 			assert(spte != ARM_PTE_EMPTY);
5091 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5092 
5093 			if (spte != tmplate) {
5094 				/*
5095 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5096 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5097 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
5098 				 * should always be cleared by this function.
5099 				 */
5100 				pte_set_was_writeable(tmplate, true);
5101 				write_pte_fast(pte_p, tmplate);
5102 				update = true;
5103 				++pass1_updated;
5104 			} else if (pte_was_writeable(tmplate)) {
5105 				/*
5106 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5107 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
5108 				 * write access to a page, this function should always at least clear that flag for
5109 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5110 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
5111 				 * be handled through arm_fast_fault().
5112 				 */
5113 				pte_set_was_writeable(tmplate, false);
5114 				write_pte_fast(pte_p, tmplate);
5115 			}
5116 		}
5117 
5118 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5119 			tlb_flush_needed = true;
5120 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5121 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5122 				issue_tlbi = true;
5123 			}
5124 		}
5125 protect_skip_pve_pass1:
5126 		pte_p = PT_ENTRY_NULL;
5127 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5128 			pve_ptep_idx = 0;
5129 			pve_p = pve_next(pve_p);
5130 		}
5131 	}
5132 
5133 	if (tlb_flush_needed) {
5134 		FLUSH_PTE_STRONG();
5135 	}
5136 
5137 	if (!remove && !issue_tlbi) {
5138 		goto protect_finish;
5139 	}
5140 
5141 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5142 	pv_entry_t **pve_pp = pv_h;
5143 	pve_p = orig_pve_p;
5144 	pte_p = orig_pte_p;
5145 	pve_ptep_idx = 0;
5146 
5147 	/*
5148 	 * We need to keep track of whether a particular PVE list contains IOMMU
5149 	 * mappings when removing entries, because we should only remove CPU
5150 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5151 	 * it around.
5152 	 */
5153 	bool iommu_mapping_in_pve = false;
5154 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5155 		if (pve_p != PV_ENTRY_NULL) {
5156 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5157 			if (pte_p == PT_ENTRY_NULL) {
5158 				goto protect_skip_pve_pass2;
5159 			}
5160 		}
5161 
5162 #ifdef PVH_FLAG_IOMMU
5163 		if (pvh_ptep_is_iommu(pte_p)) {
5164 			iommu_mapping_in_pve = true;
5165 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5166 				/*
5167 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5168 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5169 				 * contain the single IOMMU PTE and exit the loop.
5170 				 */
5171 				new_pte_p = pte_p;
5172 				break;
5173 			}
5174 			goto protect_skip_pve_pass2;
5175 		}
5176 #endif
5177 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5178 		const pmap_t pmap = ptdp->pmap;
5179 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5180 
5181 		if (remove) {
5182 			if (!compress && (pmap != kernel_pmap)) {
5183 				/*
5184 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5185 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5186 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5187 				 * under us.
5188 				 */
5189 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5190 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5191 				}
5192 			}
5193 			/* Remove this CPU mapping from PVE list. */
5194 			if (pve_p != PV_ENTRY_NULL) {
5195 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5196 			}
5197 		} else {
5198 			pt_entry_t spte = *pte_p;
5199 			if (pte_was_writeable(spte)) {
5200 				pte_set_was_writeable(spte, false);
5201 				write_pte_fast(pte_p, spte);
5202 			} else {
5203 				goto protect_skip_pve_pass2;
5204 			}
5205 		}
5206 		++pass2_updated;
5207 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5208 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5209 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5210 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5211 		}
5212 
5213 protect_skip_pve_pass2:
5214 		pte_p = PT_ENTRY_NULL;
5215 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5216 			pve_ptep_idx = 0;
5217 
5218 			if (remove) {
5219 				/**
5220 				 * If there are any IOMMU mappings in the PVE list, preserve
5221 				 * those mappings in a new PVE list (new_pve_p) which will later
5222 				 * become the new PVH entry. Keep track of the CPU mappings in
5223 				 * pveh_p/pvet_p so they can be deallocated later.
5224 				 */
5225 				if (iommu_mapping_in_pve) {
5226 					iommu_mapping_in_pve = false;
5227 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5228 					pve_remove(pv_h, pve_pp, pve_p);
5229 					pveh_p = pvh_pve_list(pv_h);
5230 					pve_p->pve_next = new_pve_p;
5231 					new_pve_p = pve_p;
5232 					pve_p = temp_pve_p;
5233 					continue;
5234 				} else {
5235 					pvet_p = pve_p;
5236 					pvh_cnt++;
5237 				}
5238 			}
5239 
5240 			pve_pp = pve_next_ptr(pve_p);
5241 			pve_p = pve_next(pve_p);
5242 			iommu_mapping_in_pve = false;
5243 		}
5244 	}
5245 
5246 protect_finish:
5247 
5248 #ifdef PVH_FLAG_EXEC
5249 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5250 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5251 	}
5252 #endif
5253 	if (__improbable(pass1_updated != pass2_updated)) {
5254 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5255 		    __func__, pass1_updated, pass2_updated);
5256 	}
5257 	/* if we removed a bunch of entries, take care of them now */
5258 	if (remove) {
5259 		if (new_pve_p != PV_ENTRY_NULL) {
5260 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5261 			pvh_set_flags(pv_h, pvh_flags);
5262 		} else if (new_pte_p != PT_ENTRY_NULL) {
5263 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5264 			pvh_set_flags(pv_h, pvh_flags);
5265 		} else {
5266 			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5267 				pmap_flush_noncoherent_page(phys);
5268 			}
5269 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5270 		}
5271 	}
5272 
5273 	if (flush_range && tlb_flush_needed) {
5274 		if (!remove) {
5275 			flush_range->ptfr_flush_needed = true;
5276 			tlb_flush_needed = false;
5277 		}
5278 	}
5279 
5280 	/*
5281 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5282 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5283 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5284 	 * a page to be repurposed while it is still live in the TLBs.
5285 	 */
5286 	if (remove && tlb_flush_needed) {
5287 		sync_tlb_flush();
5288 	}
5289 
5290 
5291 	pvh_unlock(pai);
5292 
5293 	if (remove) {
5294 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5295 #if !XNU_MONITOR
5296 		mp_enable_preemption();
5297 #endif
5298 	}
5299 
5300 	if (!remove && tlb_flush_needed) {
5301 		sync_tlb_flush();
5302 	}
5303 
5304 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5305 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5306 	}
5307 }
5308 
5309 MARK_AS_PMAP_TEXT void
5310 pmap_page_protect_options_internal(
5311 	ppnum_t ppnum,
5312 	vm_prot_t prot,
5313 	unsigned int options,
5314 	void *arg)
5315 {
5316 	if (arg != NULL) {
5317 		/*
5318 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5319 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5320 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5321 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5322 		 * In that case, force the flush to take place.
5323 		 */
5324 		options &= ~PMAP_OPTIONS_NOFLUSH;
5325 	}
5326 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5327 }
5328 
5329 void
5330 pmap_page_protect_options(
5331 	ppnum_t ppnum,
5332 	vm_prot_t prot,
5333 	unsigned int options,
5334 	void *arg)
5335 {
5336 	pmap_paddr_t    phys = ptoa(ppnum);
5337 
5338 	assert(ppnum != vm_page_fictitious_addr);
5339 
5340 	/* Only work with managed pages. */
5341 	if (!pa_valid(phys)) {
5342 		return;
5343 	}
5344 
5345 	/*
5346 	 * Determine the new protection.
5347 	 */
5348 	if (prot == VM_PROT_ALL) {
5349 		return;         /* nothing to do */
5350 	}
5351 
5352 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5353 
5354 #if XNU_MONITOR
5355 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5356 #else
5357 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5358 #endif
5359 
5360 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5361 }
5362 
5363 
5364 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5365 MARK_AS_PMAP_TEXT void
5366 pmap_disable_user_jop_internal(pmap_t pmap)
5367 {
5368 	if (pmap == kernel_pmap) {
5369 		panic("%s: called with kernel_pmap", __func__);
5370 	}
5371 	validate_pmap_mutable(pmap);
5372 	pmap->disable_jop = true;
5373 }
5374 
5375 void
5376 pmap_disable_user_jop(pmap_t pmap)
5377 {
5378 #if XNU_MONITOR
5379 	pmap_disable_user_jop_ppl(pmap);
5380 #else
5381 	pmap_disable_user_jop_internal(pmap);
5382 #endif
5383 }
5384 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5385 
5386 /*
5387  * Indicates if the pmap layer enforces some additional restrictions on the
5388  * given set of protections.
5389  */
5390 bool
5391 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5392 {
5393 	return false;
5394 }
5395 
5396 /*
5397  *	Set the physical protection on the
5398  *	specified range of this map as requested.
5399  *	VERY IMPORTANT: Will not increase permissions.
5400  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5401  */
5402 void
5403 pmap_protect(
5404 	pmap_t pmap,
5405 	vm_map_address_t b,
5406 	vm_map_address_t e,
5407 	vm_prot_t prot)
5408 {
5409 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5410 }
5411 
/*
 * Lower the protection on a VA range that lies entirely within one twig-level
 * (L2) region of the given pmap.  Never increases permissions.
 *
 * @param pmap the pmap whose mappings are updated.
 * @param start start of the VA range.
 * @param end end of the VA range; must not cross the twig boundary of start.
 * @param prot new protection; read/write combinations that would be no-ops
 *        return immediately, and VM_PROT_NONE panics (removal must be done
 *        through pmap_remove_options() instead).
 * @param options PMAP_OPTIONS_* flags.
 * @param args unused.
 *
 * @return the VA up to which the range was actually processed; this may be
 *         less than `end` if the loop broke out early to allow a pending
 *         preemption to be serviced.  The caller is expected to retry from
 *         the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must be well-formed and confined to a single twig region. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* Decide whether the new mappings will be non-executable. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && tte_is_valid_table(*tte_p)) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/* Periodically break out so a pending preemption can be taken. */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Skip empty slots and compressed markers; nothing to protect. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * Take the PVH lock for the page, re-reading the PTE after the
			 * lock is held to make sure it still maps the same physical page.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* The PTE may have been cleared/compressed while chasing the lock. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the new PTE value, starting with the access permissions. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish all PTE updates, then invalidate the processed range. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table here: nothing mapped, the whole range is done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5657 
/*
 * Lower the protection on the given VA range of the pmap, never increasing
 * permissions.  The range is processed one twig-level (L2) region at a time,
 * since the internal/PPL helper operates within a single twig and returns how
 * far it got (it may stop early for preemption).
 *
 * @param pmap the pmap whose mappings are updated.
 * @param b start of the VA range; must be page-aligned for this pmap.
 * @param e end of the VA range; must be page-aligned for this pmap.
 * @param prot new protection; requests that cannot reduce access return
 *        immediately, and VM_PROT_NONE-style requests are forwarded to
 *        pmap_remove_options().
 * @param options PMAP_OPTIONS_* flags.
 * @param args unused.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		/* Immediately revoking all access is really a removal. */
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* Clamp each chunk to the end of the current twig region. */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

		/* The helper returns the VA it reached; resume from there. */
#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5732 
5733 /**
5734  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5735  *
5736  * @param pmap pmap to insert the pages into.
5737  * @param va virtual address to map the pages into.
5738  * @param pa page number of the first physical page to map.
5739  * @param size block size, in number of pages.
5740  * @param prot mapping protection attributes.
5741  * @param attr flags to pass to pmap_enter().
5742  *
5743  * @return KERN_SUCCESS.
5744  */
5745 kern_return_t
5746 pmap_map_block(
5747 	pmap_t pmap,
5748 	addr64_t va,
5749 	ppnum_t pa,
5750 	uint32_t size,
5751 	vm_prot_t prot,
5752 	int attr,
5753 	unsigned int flags)
5754 {
5755 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5756 }
5757 
5758 /**
5759  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5760  * As opposed to pmap_map_block(), this function takes
5761  * a physical address as an input and operates using the
5762  * page size associated with the input pmap.
5763  *
5764  * @param pmap pmap to insert the pages into.
5765  * @param va virtual address to map the pages into.
5766  * @param pa physical address of the first physical page to map.
5767  * @param size block size, in number of pages.
5768  * @param prot mapping protection attributes.
5769  * @param attr flags to pass to pmap_enter().
5770  *
5771  * @return KERN_SUCCESS.
5772  */
5773 kern_return_t
5774 pmap_map_block_addr(
5775 	pmap_t pmap,
5776 	addr64_t va,
5777 	pmap_paddr_t pa,
5778 	uint32_t size,
5779 	vm_prot_t prot,
5780 	int attr,
5781 	unsigned int flags)
5782 {
5783 #if __ARM_MIXED_PAGE_SIZE__
5784 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5785 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5786 #else
5787 	const uint64_t pmap_page_size = PAGE_SIZE;
5788 #endif
5789 
5790 	for (ppnum_t page = 0; page < size; page++) {
5791 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5792 			panic("%s: failed pmap_enter_addr, "
5793 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5794 			    __FUNCTION__,
5795 			    pmap, va, (uint64_t)pa, size, prot, flags);
5796 		}
5797 
5798 		va += pmap_page_size;
5799 		pa += pmap_page_size;
5800 	}
5801 
5802 	return KERN_SUCCESS;
5803 }
5804 
5805 kern_return_t
5806 pmap_enter_addr(
5807 	pmap_t pmap,
5808 	vm_map_address_t v,
5809 	pmap_paddr_t pa,
5810 	vm_prot_t prot,
5811 	vm_prot_t fault_type,
5812 	unsigned int flags,
5813 	boolean_t wired)
5814 {
5815 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5816 }
5817 
5818 /*
5819  *	Insert the given physical page (p) at
5820  *	the specified virtual address (v) in the
5821  *	target physical map with the protection requested.
5822  *
5823  *	If specified, the page will be wired down, meaning
5824  *	that the related pte can not be reclaimed.
5825  *
5826  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5827  *	or lose information.  That is, this routine must actually
5828  *	insert this page into the given map eventually (must make
 *	forward progress eventually).
5830  */
5831 kern_return_t
5832 pmap_enter(
5833 	pmap_t pmap,
5834 	vm_map_address_t v,
5835 	ppnum_t pn,
5836 	vm_prot_t prot,
5837 	vm_prot_t fault_type,
5838 	unsigned int flags,
5839 	boolean_t wired,
5840 	__unused pmap_mapping_type_t mapping_type)
5841 {
5842 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5843 }
5844 
5845 /*
5846  * Attempt to commit the pte.
5847  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5848  * Performs no page table or accounting writes on failures.
5849  */
5850 static inline bool
5851 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5852 {
5853 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5854 	bool success = false, changed_wiring = false;
5855 
5856 	__unreachable_ok_push
5857 	if (TEST_PAGE_RATIO_4) {
5858 		/*
5859 		 * 16K virtual pages w/ 4K hw pages.
5860 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5861 		 * As a result we require the exclusive pmap lock.
5862 		 */
5863 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5864 		*old_pte = *pte_p;
5865 		if (*old_pte == new_pte) {
5866 			/* Another thread completed this operation. Nothing to do here. */
5867 			success = true;
5868 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5869 		    pte_is_valid(*old_pte)) {
5870 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5871 			success = false;
5872 		} else {
5873 			write_pte_fast(pte_p, new_pte);
5874 			success = true;
5875 		}
5876 	} else {
5877 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5878 	}
5879 	__unreachable_ok_pop
5880 
5881 	if (success && *old_pte != new_pte) {
5882 		if (pte_is_valid(*old_pte)) {
5883 			bool need_strong_sync = false;
5884 			FLUSH_PTE_STRONG();
5885 #if HAS_FEAT_XS
5886 			if (pte_is_xs(pt_attr, *old_pte)) {
5887 				need_strong_sync = true;
5888 			}
5889 #endif /* HAS_FEAT_XS */
5890 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5891 		} else {
5892 			FLUSH_PTE();
5893 			__builtin_arm_isb(ISB_SY);
5894 		}
5895 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5896 		    (new_pte & ARM_PTE_WIRED) != 0 :
5897 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5898 
5899 		if (pmap != kernel_pmap && changed_wiring) {
5900 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5901 			if (new_pte & ARM_PTE_WIRED) {
5902 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5903 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5904 			} else {
5905 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5906 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5907 			}
5908 		}
5909 
5910 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5911 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5912 	}
5913 	return success;
5914 }
5915 
/*
 * Translate VM_WIMG_* cacheability/ordering attributes into the corresponding
 * PTE memory-attribute bits (ATTRINDX plus shareability), for the physical
 * address being mapped.  Device-type attributes also force NX/PNX so that
 * device memory can never be executable.
 *
 * @param wimg the VM_WIMG_* attribute (masked with VM_WIMG_MASK here).
 * @param pa the physical address; used to distinguish DRAM from device/IO
 *        space, which affects the attribute index chosen.
 *
 * @return the attribute portion of a PTE (no address, validity, or AP bits).
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			/* IO ranges flagged for strong sync use the XS attribute index. */
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized attribute: fall back to the default cacheable policy. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5993 
5994 
5995 /*
5996  * Construct a PTE (and the physical page attributes) for the given virtual to
5997  * physical mapping.
5998  *
5999  * This function has no side effects and is safe to call so that it is safe to
6000  * call while attempting a pmap_enter transaction.
6001  */
6002 MARK_AS_PMAP_TEXT static pt_entry_t
6003 pmap_construct_pte(
6004 	const pmap_t pmap,
6005 	vm_map_address_t va,
6006 	pmap_paddr_t pa,
6007 	vm_prot_t prot,
6008 	vm_prot_t fault_type,
6009 	boolean_t wired,
6010 	const pt_attr_t* const pt_attr,
6011 	uint16_t *pp_attr_bits /* OUTPUT */
6012 	)
6013 {
6014 	bool set_NX = false, set_XO = false;
6015 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
6016 	assert(pp_attr_bits != NULL);
6017 	*pp_attr_bits = 0;
6018 
6019 	if (wired) {
6020 		pte |= ARM_PTE_WIRED;
6021 	}
6022 
6023 #if DEVELOPMENT || DEBUG
6024 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
6025 #else
6026 	if ((prot & VM_PROT_EXECUTE))
6027 #endif
6028 	{
6029 		set_NX = false;
6030 	} else {
6031 		set_NX = true;
6032 	}
6033 
6034 	if (prot == VM_PROT_EXECUTE) {
6035 		set_XO = true;
6036 	}
6037 
6038 	if (set_NX) {
6039 		pte |= pt_attr_leaf_xn(pt_attr);
6040 	} else {
6041 		if (pmap == kernel_pmap) {
6042 			pte |= ARM_PTE_NX;
6043 		} else {
6044 			pte |= pt_attr_leaf_x(pt_attr);
6045 		}
6046 	}
6047 
6048 	if (pmap == kernel_pmap) {
6049 #if __ARM_KERNEL_PROTECT__
6050 		pte |= ARM_PTE_NG;
6051 #endif /* __ARM_KERNEL_PROTECT__ */
6052 		if (prot & VM_PROT_WRITE) {
6053 			pte |= ARM_PTE_AP(AP_RWNA);
6054 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6055 		} else {
6056 			pte |= ARM_PTE_AP(AP_RONA);
6057 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6058 		}
6059 	} else {
6060 		if (pmap->type != PMAP_TYPE_NESTED) {
6061 			pte |= ARM_PTE_NG;
6062 		} else if ((pmap->nested_region_unnested_table_bitmap)
6063 		    && (va >= pmap->nested_region_addr)
6064 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
6065 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
6066 
6067 			if ((pmap->nested_region_unnested_table_bitmap)
6068 			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
6069 				pte |= ARM_PTE_NG;
6070 			}
6071 		}
6072 		if (prot & VM_PROT_WRITE) {
6073 			assert(pmap->type != PMAP_TYPE_NESTED);
6074 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6075 				if (fault_type & VM_PROT_WRITE) {
6076 					pte |= pt_attr_leaf_rw(pt_attr);
6077 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6078 				} else {
6079 					pte |= pt_attr_leaf_ro(pt_attr);
6080 					/*
6081 					 * Mark the page as MODFAULT so that a subsequent write
6082 					 * may be handled through arm_fast_fault().
6083 					 */
6084 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6085 					pte_set_was_writeable(pte, true);
6086 				}
6087 			} else {
6088 				pte |= pt_attr_leaf_rw(pt_attr);
6089 				*pp_attr_bits |= PP_ATTR_REFERENCED;
6090 			}
6091 		} else {
6092 			if (set_XO) {
6093 				pte |= pt_attr_leaf_rona(pt_attr);
6094 			} else {
6095 				pte |= pt_attr_leaf_ro(pt_attr);
6096 			}
6097 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6098 		}
6099 	}
6100 
6101 	pte |= ARM_PTE_AF;
6102 	return pte;
6103 }
6104 
6105 MARK_AS_PMAP_TEXT kern_return_t
6106 pmap_enter_options_internal(
6107 	pmap_t pmap,
6108 	vm_map_address_t v,
6109 	pmap_paddr_t pa,
6110 	vm_prot_t prot,
6111 	vm_prot_t fault_type,
6112 	unsigned int flags,
6113 	boolean_t wired,
6114 	unsigned int options)
6115 {
6116 	ppnum_t         pn = (ppnum_t)atop(pa);
6117 	pt_entry_t      pte;
6118 	pt_entry_t      spte;
6119 	pt_entry_t      *pte_p;
6120 	bool            refcnt_updated;
6121 	bool            wiredcnt_updated;
6122 	bool            ro_va = false;
6123 	unsigned int    wimg_bits;
6124 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6125 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6126 	kern_return_t   kr = KERN_SUCCESS;
6127 	uint16_t pp_attr_bits;
6128 	volatile uint16_t *refcnt;
6129 	volatile uint16_t *wiredcnt;
6130 	pv_free_list_t *local_pv_free;
6131 
6132 	validate_pmap_mutable(pmap);
6133 
6134 #if XNU_MONITOR
6135 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6136 		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6137 	}
6138 #endif
6139 
6140 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6141 
6142 	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6143 		panic("%s: pmap %p v 0x%llx not page-aligned",
6144 		    __func__, pmap, (unsigned long long)v);
6145 	}
6146 
6147 	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6148 		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6149 	}
6150 
6151 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6152 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6153 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6154 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6155 	}
6156 
6157 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6158 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6159 		    pmap, (uint64_t)pa);
6160 	}
6161 
6162 	/* The PA should not extend beyond the architected physical address space */
6163 	pa &= ARM_PTE_PAGE_MASK;
6164 
6165 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6166 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6167 		extern vm_offset_t ctrr_test_page;
6168 		if (__probable(v != ctrr_test_page))
6169 #endif
6170 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6171 	}
6172 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6173 		if (__improbable(prot != VM_PROT_READ)) {
6174 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6175 			    __func__, (unsigned long long)v, prot);
6176 		}
6177 		ro_va = true;
6178 	}
6179 	assert(pn != vm_page_fictitious_addr);
6180 
6181 	refcnt_updated = false;
6182 	wiredcnt_updated = false;
6183 
6184 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6185 		/*
6186 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6187 		 *
6188 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6189 		 */
6190 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6191 	}
6192 
6193 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6194 		return KERN_ABORTED;
6195 	}
6196 
6197 	/*
6198 	 *	Expand pmap to include this pte.  Assume that
6199 	 *	pmap is always expanded to include enough hardware
6200 	 *	pages to map one VM page.
6201 	 */
6202 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6203 		/* Must unlock to expand the pmap. */
6204 		pmap_unlock(pmap, lock_mode);
6205 
6206 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6207 
6208 		if (kr != KERN_SUCCESS) {
6209 			return kr;
6210 		}
6211 
6212 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6213 			return KERN_ABORTED;
6214 		}
6215 	}
6216 
6217 	if (options & PMAP_OPTIONS_NOENTER) {
6218 		pmap_unlock(pmap, lock_mode);
6219 		return KERN_SUCCESS;
6220 	}
6221 
6222 	/*
6223 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6224 	 * done via a cmpxchg loop.
6225 	 * We need to be careful about modifying non-local data structures before commiting
6226 	 * the new pte since we may need to re-do the transaction.
6227 	 */
6228 	spte = os_atomic_load(pte_p, relaxed);
6229 	while (!committed) {
6230 		refcnt = NULL;
6231 		wiredcnt = NULL;
6232 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6233 		had_valid_mapping = pte_is_valid(spte);
6234 
6235 		if (pmap != kernel_pmap) {
6236 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6237 			refcnt = &ptd_info->refcnt;
6238 			wiredcnt = &ptd_info->wiredcnt;
6239 			/*
6240 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6241 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6242 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6243 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6244 			 * have PTDs, so we can't use the check there.
6245 			 */
6246 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6247 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6248 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6249 			}
6250 			/*
6251 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6252 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6253 			 * or acquire the pmap lock exclusive.
6254 			 */
6255 			if (!wiredcnt_updated) {
6256 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6257 				wiredcnt_updated = true;
6258 			}
6259 			if (!refcnt_updated) {
6260 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6261 				refcnt_updated = true;
6262 				drop_refcnt = true;
6263 			}
6264 		}
6265 
6266 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6267 			/*
6268 			 * There is already a mapping here & it's for a different physical page.
6269 			 * First remove that mapping.
6270 			 *
6271 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6272 			 */
6273 			if (lock_mode == PMAP_LOCK_SHARED) {
6274 				if (pmap_lock_shared_to_exclusive(pmap)) {
6275 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6276 				} else {
6277 					/*
6278 					 * We failed to upgrade to an exclusive lock.
6279 					 * As a result we no longer hold the lock at all,
6280 					 * so we need to re-acquire it and restart the transaction.
6281 					 */
6282 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6283 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6284 					/* pmap might have changed after we dropped the lock. Try again. */
6285 					spte = os_atomic_load(pte_p, relaxed);
6286 					continue;
6287 				}
6288 			}
6289 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6290 			spte = ARM_PTE_EMPTY;
6291 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_EMPTY);
6292 		}
6293 
6294 		/*
6295 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6296 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6297 		 * read-write protection. The PMAP layer though still needs to use the right
6298 		 * index, which is the older XO-now-TPRO one and that is specially selected
6299 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6300 		 */
6301 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6302 			if (__improbable(pmap == kernel_pmap)) {
6303 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6304 				    __func__);
6305 			}
6306 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6307 		} else {
6308 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6309 		}
6310 
6311 		if (pa_valid(pa)) {
6312 			unsigned int pai;
6313 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6314 
6315 			is_internal = FALSE;
6316 			is_altacct = FALSE;
6317 
6318 			pai = pa_index(pa);
6319 
6320 			pvh_lock(pai);
6321 
6322 			/*
6323 			 * Make sure that the current per-cpu PV free list has
6324 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6325 			 * if the transaction succeeds. We're either in the
6326 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6327 			 * Note that we can still be interrupted, but a primary
6328 			 * interrupt handler can never enter the pmap.
6329 			 */
6330 #if !XNU_MONITOR
6331 			assert(get_preemption_level() > 0);
6332 #endif
6333 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6334 			pv_entry_t **pv_h = pai_to_pvh(pai);
6335 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6336 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6337 
6338 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6339 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6340 				int new_allocated_pves = 0;
6341 
6342 				while (new_allocated_pves < 2) {
6343 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6344 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6345 					if (pv_status == PV_ALLOC_FAIL) {
6346 						break;
6347 					} else if (pv_status == PV_ALLOC_RETRY) {
6348 						/*
6349 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6350 						 * it will have dropped the pmap lock while doing so.
6351 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6352 						 * be on a different CPU now.
6353 						 */
6354 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6355 					} else {
6356 						/* If we've gotten this far then a node should've been allocated. */
6357 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6358 
6359 						new_allocated_pves++;
6360 					}
6361 				}
6362 
6363 				for (int i = 0; i < new_allocated_pves; i++) {
6364 					pv_free(new_pve_p[i]);
6365 				}
6366 			}
6367 
6368 			if (pv_status == PV_ALLOC_FAIL) {
6369 				pvh_unlock(pai);
6370 				kr = KERN_RESOURCE_SHORTAGE;
6371 				break;
6372 			} else if (pv_status == PV_ALLOC_RETRY) {
6373 				pvh_unlock(pai);
6374 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6375 				spte = os_atomic_load(pte_p, relaxed);
6376 				continue;
6377 			}
6378 
6379 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6380 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6381 			} else {
6382 				wimg_bits = pmap_cache_attributes(pn);
6383 			}
6384 
6385 			/* We may be retrying this operation after dropping the PVH lock.
6386 			 * Cache attributes for the physical page may have changed while the lock
6387 			 * was dropped, so clear any cache attributes we may have previously set
6388 			 * in the PTE template. */
6389 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6390 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6391 
6392 #if XNU_MONITOR
6393 			/* The regular old kernel is not allowed to remap PPL pages. */
6394 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6395 				panic("%s: page belongs to PPL, "
6396 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6397 				    __FUNCTION__,
6398 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6399 			}
6400 
6401 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6402 				panic("%s: page locked down, "
6403 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6404 				    __FUNCTION__,
6405 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6406 			}
6407 #endif
6408 
6409 
6410 
6411 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6412 			if (!committed) {
6413 				pvh_unlock(pai);
6414 				continue;
6415 			}
6416 			had_valid_mapping = pte_is_valid(spte);
6417 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6418 
6419 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6420 			/*
6421 			 * If there was already a valid pte here then we reuse its reference
6422 			 * on the ptd and drop the one that we took above.
6423 			 */
6424 			drop_refcnt = had_valid_mapping;
6425 
6426 			if (!had_valid_mapping) {
6427 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6428 				int pve_ptep_idx = 0;
6429 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6430 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6431 				if (pv_status != PV_ALLOC_SUCCESS) {
6432 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6433 					    __func__, pv_status, new_pve_p, pmap);
6434 				}
6435 
6436 				if (pmap != kernel_pmap) {
6437 					if (options & PMAP_OPTIONS_INTERNAL) {
6438 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6439 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6440 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6441 							/*
6442 							 * Make a note to ourselves that this
6443 							 * mapping is using alternative
6444 							 * accounting. We'll need this in order
6445 							 * to know which ledger to debit when
6446 							 * the mapping is removed.
6447 							 *
6448 							 * The altacct bit must be set while
6449 							 * the pv head is locked. Defer the
6450 							 * ledger accounting until after we've
6451 							 * dropped the lock.
6452 							 */
6453 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6454 							is_altacct = TRUE;
6455 						}
6456 					}
6457 					if (ppattr_test_reusable(pai) &&
6458 					    !is_altacct) {
6459 						is_reusable = TRUE;
6460 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6461 						is_internal = TRUE;
6462 					} else {
6463 						is_external = TRUE;
6464 					}
6465 				}
6466 			}
6467 
6468 			pvh_unlock(pai);
6469 
6470 			if (pp_attr_bits != 0) {
6471 				ppattr_pa_set_bits(pa, pp_attr_bits);
6472 			}
6473 
6474 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6475 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6476 
6477 				if (is_internal) {
6478 					/*
6479 					 * Make corresponding adjustments to
6480 					 * phys_footprint statistics.
6481 					 */
6482 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6483 					if (is_altacct) {
6484 						/*
6485 						 * If this page is internal and
6486 						 * in an IOKit region, credit
6487 						 * the task's total count of
6488 						 * dirty, internal IOKit pages.
6489 						 * It should *not* count towards
6490 						 * the task's total physical
6491 						 * memory footprint, because
6492 						 * this entire region was
6493 						 * already billed to the task
6494 						 * at the time the mapping was
6495 						 * created.
6496 						 *
6497 						 * Put another way, this is
6498 						 * internal++ and
6499 						 * alternate_accounting++, so
6500 						 * net effect on phys_footprint
6501 						 * is 0. That means: don't
6502 						 * touch phys_footprint here.
6503 						 */
6504 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6505 					} else {
6506 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6507 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6508 							skip_footprint_debit = true;
6509 						} else {
6510 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6511 						}
6512 					}
6513 				}
6514 				if (is_reusable) {
6515 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6516 				} else if (is_external) {
6517 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6518 				}
6519 			}
6520 		} else {
6521 			if (prot & VM_PROT_EXECUTE) {
6522 				kr = KERN_FAILURE;
6523 				break;
6524 			}
6525 
6526 			wimg_bits = pmap_cache_attributes(pn);
6527 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6528 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6529 			}
6530 
6531 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6532 
6533 #if XNU_MONITOR
6534 			pte = pmap_construct_io_pte(pa, pte);
6535 
6536 			/**
6537 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6538 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6539 			 * created and later removed.  We must therefore prevent an attacker from downgrading a
6540 			 * a writable mapping in order to allow it to be removed and remapped to something else.
6541 			 */
6542 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6543 			    pte_is_valid(spte) &&
6544 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6545 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6546 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6547 				    __func__, (uint64_t)pte_to_pa(spte));
6548 			}
6549 #endif
6550 
6551 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6552 			if (committed) {
6553 				had_valid_mapping = pte_is_valid(spte);
6554 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6555 
6556 				/**
6557 				 * If there was already a valid pte here then we reuse its
6558 				 * reference on the ptd and drop the one that we took above.
6559 				 */
6560 				drop_refcnt = had_valid_mapping;
6561 			}
6562 		}
6563 		if (committed) {
6564 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6565 				assert(pmap != kernel_pmap);
6566 
6567 				/* One less "compressed" */
6568 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6569 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6570 
6571 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6572 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6573 				} else if (!skip_footprint_debit) {
6574 					/* Was part of the footprint */
6575 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6576 				}
6577 				/* The old entry held a reference so drop the extra one that we took above. */
6578 				drop_refcnt = true;
6579 			}
6580 		}
6581 	}
6582 
6583 	if (drop_refcnt && refcnt != NULL) {
6584 		assert(refcnt_updated);
6585 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6586 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6587 		}
6588 	}
6589 
6590 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6591 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6592 	}
6593 
6594 	pmap_unlock(pmap, lock_mode);
6595 
6596 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6597 		pmap_phys_write_disable(v);
6598 	}
6599 
6600 	return kr;
6601 }
6602 
/*
 * Create a mapping of physical address 'pa' at virtual address 'v' in 'pmap',
 * retrying the underlying enter operation around transient failures
 * (resource shortages and preemption-pending lock aborts).
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * Always pass PMAP_OPTIONS_NOWAIT into the PPL; shortages are reported
		 * back here and satisfied by pmap_alloc_page_for_ppl() before retrying.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed the PPL's page free list, honoring the caller's NOWAIT request. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller asked not to block: report the shortage instead of retrying. */
				break;
			}
		}
		/* KERN_ABORTED means a lock attempt bailed out for pending preemption; retry. */
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6649 
6650 kern_return_t
6651 pmap_enter_options(
6652 	pmap_t pmap,
6653 	vm_map_address_t v,
6654 	ppnum_t pn,
6655 	vm_prot_t prot,
6656 	vm_prot_t fault_type,
6657 	unsigned int flags,
6658 	boolean_t wired,
6659 	unsigned int options,
6660 	__unused void   *arg,
6661 	pmap_mapping_type_t mapping_type)
6662 {
6663 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6664 }
6665 
6666 /*
6667  *	Routine:	pmap_change_wiring
6668  *	Function:	Change the wiring attribute for a map/virtual-address
6669  *			pair.
6670  *	In/out conditions:
6671  *			The mapping must already exist in the pmap.
6672  */
6673 MARK_AS_PMAP_TEXT kern_return_t
6674 pmap_change_wiring_internal(
6675 	pmap_t pmap,
6676 	vm_map_address_t v,
6677 	boolean_t wired)
6678 {
6679 	pt_entry_t     *pte_p;
6680 	pmap_paddr_t    pa;
6681 
6682 	validate_pmap_mutable(pmap);
6683 
6684 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6685 		return KERN_ABORTED;
6686 	}
6687 
6688 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6689 
6690 	pte_p = pmap_pte(pmap, v);
6691 	if (pte_p == PT_ENTRY_NULL) {
6692 		if (!wired) {
6693 			/*
6694 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6695 			 * may have been freed by a remove operation.
6696 			 */
6697 			goto pmap_change_wiring_return;
6698 		} else {
6699 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6700 		}
6701 	}
6702 	/*
6703 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6704 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6705 	 */
6706 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6707 
6708 	while (pa_valid(pa)) {
6709 		pmap_paddr_t new_pa;
6710 
6711 		pvh_lock(pa_index(pa));
6712 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6713 
6714 		if (pa == new_pa) {
6715 			break;
6716 		}
6717 
6718 		pvh_unlock(pa_index(pa));
6719 		pa = new_pa;
6720 	}
6721 
6722 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6723 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6724 		if (!wired) {
6725 			/* PTE cleared by prior remove/disconnect operation */
6726 			goto pmap_change_wiring_cleanup;
6727 		} else {
6728 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6729 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6730 		}
6731 	}
6732 
6733 	assertf(pte_is_valid(*pte_p), "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6734 	if (wired != pte_is_wired(*pte_p)) {
6735 		pte_set_wired(pmap, pte_p, wired);
6736 		if (pmap != kernel_pmap) {
6737 			if (wired) {
6738 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6739 			} else if (!wired) {
6740 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6741 			}
6742 		}
6743 	}
6744 
6745 pmap_change_wiring_cleanup:
6746 	if (pa_valid(pa)) {
6747 		pvh_unlock(pa_index(pa));
6748 	}
6749 
6750 pmap_change_wiring_return:
6751 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6752 
6753 	return KERN_SUCCESS;
6754 }
6755 
6756 void
6757 pmap_change_wiring(
6758 	pmap_t pmap,
6759 	vm_map_address_t v,
6760 	boolean_t wired)
6761 {
6762 	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
6763 	pmap_verify_preemptible();
6764 
6765 	kern_return_t kr = KERN_FAILURE;
6766 #if XNU_MONITOR
6767 	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6768 	do {
6769 		kr = pmap_change_wiring_ppl(pmap, v, wired);
6770 	} while (kr == KERN_ABORTED);
6771 
6772 	pmap_ledger_check_balance(pmap);
6773 #else
6774 	/* Since we verified preemptibility, call the helper only once. */
6775 	kr = pmap_change_wiring_internal(pmap, v, wired);
6776 #endif
6777 
6778 	if (kr != KERN_SUCCESS) {
6779 		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6780 		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6781 	}
6782 }
6783 
6784 MARK_AS_PMAP_TEXT pmap_paddr_t
6785 pmap_find_pa_internal(
6786 	pmap_t pmap,
6787 	addr64_t va)
6788 {
6789 	pmap_paddr_t    pa = 0;
6790 
6791 	validate_pmap(pmap);
6792 
6793 	if (pmap != kernel_pmap) {
6794 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6795 	}
6796 
6797 	pa = pmap_vtophys(pmap, va);
6798 
6799 	if (pmap != kernel_pmap) {
6800 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6801 	}
6802 
6803 	return pa;
6804 }
6805 
6806 pmap_paddr_t
6807 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6808 {
6809 	pmap_paddr_t pa = 0;
6810 
6811 	if (pmap == kernel_pmap) {
6812 		pa = mmu_kvtop(va);
6813 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6814 		/*
6815 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6816 		 * translation even if PAN would prevent kernel access through the translation.
6817 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6818 		 */
6819 		pa = mmu_uvtop(va);
6820 	}
6821 	return pa;
6822 }
6823 
6824 pmap_paddr_t
6825 pmap_find_pa(
6826 	pmap_t pmap,
6827 	addr64_t va)
6828 {
6829 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6830 
6831 	if (pa != 0) {
6832 		return pa;
6833 	}
6834 
6835 	if (not_in_kdp) {
6836 #if XNU_MONITOR
6837 		return pmap_find_pa_ppl(pmap, va);
6838 #else
6839 		return pmap_find_pa_internal(pmap, va);
6840 #endif
6841 	} else {
6842 		return pmap_vtophys(pmap, va);
6843 	}
6844 }
6845 
6846 ppnum_t
6847 pmap_find_phys_nofault(
6848 	pmap_t pmap,
6849 	addr64_t va)
6850 {
6851 	ppnum_t ppn;
6852 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6853 	return ppn;
6854 }
6855 
6856 ppnum_t
6857 pmap_find_phys(
6858 	pmap_t pmap,
6859 	addr64_t va)
6860 {
6861 	ppnum_t ppn;
6862 	ppn = atop(pmap_find_pa(pmap, va));
6863 	return ppn;
6864 }
6865 
6866 /**
6867  * Translate a kernel virtual address into a physical address.
6868  *
6869  * @param va The kernel virtual address to translate. Does not work on user
6870  *           virtual addresses.
6871  *
6872  * @return The physical address if the translation was successful, or zero if
6873  *         no valid mappings were found for the given virtual address.
6874  */
6875 pmap_paddr_t
6876 kvtophys(vm_offset_t va)
6877 {
6878 	/**
6879 	 * Attempt to do the translation first in hardware using the AT (address
6880 	 * translation) instruction. This will attempt to use the MMU to do the
6881 	 * translation for us.
6882 	 */
6883 	pmap_paddr_t pa = mmu_kvtop(va);
6884 
6885 	if (pa) {
6886 		return pa;
6887 	}
6888 
6889 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6890 	return pmap_vtophys(kernel_pmap, va);
6891 }
6892 
6893 /**
6894  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6895  * points to a non-kernel-managed physical page, then this call will panic().
6896  *
6897  * @note The output of this function is guaranteed to be a kernel-managed
6898  *       physical page, which means it's safe to pass the output directly to
6899  *       pa_index() to create a physical address index for various pmap data
6900  *       structures.
6901  *
6902  * @param va The kernel virtual address to translate. Does not work on user
6903  *           virtual addresses.
6904  *
6905  * @return The translated physical address for the given virtual address.
6906  */
6907 pmap_paddr_t
6908 kvtophys_nofail(vm_offset_t va)
6909 {
6910 	pmap_paddr_t pa = kvtophys(va);
6911 
6912 	if (!pa_valid(pa)) {
6913 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6914 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6915 	}
6916 
6917 	return pa;
6918 }
6919 
6920 pmap_paddr_t
6921 pmap_vtophys(
6922 	pmap_t pmap,
6923 	addr64_t va)
6924 {
6925 	if ((va < pmap->min) || (va >= pmap->max)) {
6926 		return 0;
6927 	}
6928 
6929 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6930 
6931 	tt_entry_t * ttp = NULL;
6932 	tt_entry_t * ttep = NULL;
6933 	tt_entry_t   tte = ARM_TTE_EMPTY;
6934 	pmap_paddr_t pa = 0;
6935 	unsigned int cur_level;
6936 
6937 	ttp = pmap->tte;
6938 
6939 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6940 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6941 
6942 		tte = *ttep;
6943 
6944 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6945 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6946 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6947 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6948 
6949 		if ((tte & valid_mask) != valid_mask) {
6950 			return (pmap_paddr_t) 0;
6951 		}
6952 
6953 		/* This detects both leaf entries and intermediate block mappings. */
6954 		if ((tte & type_mask) == type_block) {
6955 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6956 			break;
6957 		}
6958 
6959 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6960 	}
6961 
6962 	return pa;
6963 }
6964 
6965 /*
6966  *	pmap_init_pte_page - Initialize a page table page.
6967  */
6968 MARK_AS_PMAP_TEXT void
6969 pmap_init_pte_page(
6970 	pmap_t pmap,
6971 	pt_entry_t *pte_p,
6972 	vm_offset_t va,
6973 	unsigned int ttlevel,
6974 	boolean_t alloc_ptd)
6975 {
6976 	pt_desc_t   *ptdp = NULL;
6977 	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6978 
6979 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6980 		if (alloc_ptd) {
6981 			/*
6982 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6983 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6984 			 * bootstrap request, so we check for an existing PTD here.
6985 			 */
6986 			ptdp = ptd_alloc(pmap);
6987 			if (ptdp == NULL) {
6988 				panic("%s: unable to allocate PTD", __func__);
6989 			}
6990 			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6991 			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6992 			pvh_set_flags(pvh, 0);
6993 		} else {
6994 			panic("pmap_init_pte_page(): pte_p %p", pte_p);
6995 		}
6996 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6997 		ptdp = pvh_ptd(pvh);
6998 	} else {
6999 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
7000 	}
7001 
7002 	// below barrier ensures previous updates to the page are visible to PTW before
7003 	// it is linked to the PTE of previous level
7004 	__builtin_arm_dmb(DMB_ISHST);
7005 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
7006 }
7007 
7008 /*
7009  *	Routine:	pmap_expand
7010  *
7011  *	Expands a pmap to be able to map the specified virtual address.
7012  *
7013  *	Allocates new memory for the default (COARSE) translation table
7014  *	entry, initializes all the pte entries to ARM_PTE_EMPTY and
7015  *	also allocates space for the corresponding pv entries.
7016  *
7017  *	Nothing should be locked.
7018  */
7019 MARK_AS_PMAP_TEXT static kern_return_t
7020 pmap_expand(
7021 	pmap_t pmap,
7022 	vm_map_address_t v,
7023 	unsigned int options,
7024 	unsigned int level)
7025 {
7026 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7027 
7028 	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
7029 		return KERN_INVALID_ADDRESS;
7030 	}
7031 	pmap_paddr_t    pa;
7032 	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
7033 	tt_entry_t              *tte_p;
7034 	tt_entry_t              *tt_p;
7035 
7036 	pa = 0x0ULL;
7037 	tt_p =  (tt_entry_t *)NULL;
7038 
7039 	for (; ttlevel < level; ttlevel++) {
7040 		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
7041 			return KERN_ABORTED;
7042 		}
7043 
7044 		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
7045 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
7046 			kern_return_t ret;
7047 			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
7048 				if (options & PMAP_OPTIONS_NOWAIT) {
7049 					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
7050 					return ret;
7051 				}
7052 #if XNU_MONITOR
7053 				panic("%s: failed to allocate tt, "
7054 				    "pmap=%p, v=%p, options=0x%x, level=%u",
7055 				    __FUNCTION__,
7056 				    pmap, (void *)v, options, level);
7057 #else
7058 				VM_PAGE_WAIT();
7059 #endif
7060 			}
7061 
7062 			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
7063 				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7064 				return KERN_ABORTED;
7065 			}
7066 
7067 			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
7068 				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
7069 				pa = kvtophys_nofail((vm_offset_t)tt_p);
7070 				tte_p = pmap_ttne(pmap, ttlevel, v);
7071 				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7072 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7073 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
7074 				pa = 0x0ULL;
7075 				tt_p = (tt_entry_t *)NULL;
7076 			}
7077 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7078 		} else {
7079 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
7080 		}
7081 
7082 		if (tt_p != (tt_entry_t *)NULL) {
7083 			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7084 			tt_p = (tt_entry_t *)NULL;
7085 		}
7086 	}
7087 
7088 	return KERN_SUCCESS;
7089 }
7090 
7091 /*
7092  *	Routine:	pmap_gc
7093  *	Function:
7094  *              Pmap garbage collection
7095  *		Called by the pageout daemon when pages are scarce.
7096  *
7097  */
void
pmap_gc(void)
{
	/*
	 * Deliberately empty.  No meaningful garbage collection has ever been
	 * implemented here: an arbitrary pmap cannot simply be destroyed, since
	 * it may be active on a CPU or contain wired mappings.  With the
	 * relatively recent change making pmap_page_reclaim() non-fatal when no
	 * eligible page is found, a future implementation might call it here.
	 */
}
7109 
7110 /*
7111  *      By default, don't attempt pmap GC more frequently
7112  *      than once / 1 minutes.
7113  */
7114 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: pmap GC is a no-op on this architecture (see pmap_gc()). */
}
7120 
7121 /*
7122  * pmap_attribute_cache_sync(vm_offset_t pa)
7123  *
7124  * Invalidates all of the instruction cache on a physical page and
7125  * pushes any dirty data from the data cache for the same physical page
7126  */
7127 
7128 kern_return_t
7129 pmap_attribute_cache_sync(
7130 	ppnum_t pp,
7131 	vm_size_t size,
7132 	__unused vm_machine_attribute_t attribute,
7133 	__unused vm_machine_attribute_val_t * value)
7134 {
7135 	if (size > PAGE_SIZE) {
7136 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7137 	} else {
7138 		cache_sync_page(pp);
7139 	}
7140 
7141 	return KERN_SUCCESS;
7142 }
7143 
7144 /*
7145  * pmap_sync_page_data_phys(ppnum_t pp)
7146  *
7147  * Invalidates all of the instruction cache on a physical page and
7148  * pushes any dirty data from the data cache for the same physical page
7149  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Clean dirty D-cache lines and invalidate the I-cache for this page. */
	cache_sync_page(pp);
}
7156 
7157 /*
7158  * pmap_sync_page_attributes_phys(ppnum_t pp)
7159  *
7160  * Write back and invalidate all cachelines on a physical page.
7161  */
7162 void
7163 pmap_sync_page_attributes_phys(
7164 	ppnum_t pp)
7165 {
7166 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7167 }
7168 
#if CONFIG_COREDUMP
/* temporary workaround */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	pt_entry_t *ptep;

	/* No leaf PTE means there is nothing mapped to dump. */
	ptep = pmap_pte(map->pmap, va);
	if (ptep == PT_ENTRY_NULL) {
		return FALSE;
	}

	/* Device-pager-backed mappings are not safe to read for a core dump. */
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}

	/* Only dump mappings with the default (normal cacheable) memory attribute. */
	const pt_entry_t pte_val = *ptep;
	return (pte_val & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
}
#endif
7190 
7191 void
7192 fillPage(
7193 	ppnum_t pn,
7194 	unsigned int fill)
7195 {
7196 	unsigned int   *addr;
7197 	int             count;
7198 
7199 	addr = (unsigned int *) phystokv(ptoa(pn));
7200 	count = PAGE_SIZE / sizeof(unsigned int);
7201 	while (count--) {
7202 		*addr++ = fill;
7203 	}
7204 }
7205 
7206 extern void     mapping_set_mod(ppnum_t pn);
7207 
void
mapping_set_mod(
	ppnum_t pn)
{
	/* VM-layer alias: set the software "modified" attribute for the page. */
	pmap_set_modify(pn);
}
7214 
7215 extern void     mapping_set_ref(ppnum_t pn);
7216 
void
mapping_set_ref(
	ppnum_t pn)
{
	/* VM-layer alias: set the software "referenced" attribute for the page. */
	pmap_set_reference(pn);
}
7223 
7224 /*
7225  * Clear specified attribute bits.
7226  *
7227  * Try to force an arm_fast_fault() for all mappings of
7228  * the page - to force attributes to be set again at fault time.
7229  * If the forcing succeeds, clear the cached bits at the head.
7230  * Otherwise, something must have been wired, so leave the cached
7231  * attributes alone.
7232  */
/*
 * Clear the given PP_ATTR_* bits for page `pn`.  When clearing ref/mod we
 * first force all mappings of the page to fast-fault (so the bits can be
 * regathered on the next access); the cached bits are cleared only if that
 * succeeds for every mapping.  A non-NULL `flush_range` delegates TLB
 * invalidation for covered VAs to the caller.
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may only be manipulated by the PPL itself. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A flush callback (arg) or batched flush range supersedes NOFLUSH. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		/* Remove write permission from all mappings of the page. */
		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Map the attribute bits being cleared to the access modes to revoke. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping was successfully downgraded. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7306 
/* Single-page entry point: clear attribute bits with no batched flush range. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7316 
7317 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped in [start, end) within
 * a single twig (one leaf page table) of `pmap`.  Returns the VA at which
 * processing stopped: `end` on completion, or an earlier VA if preemption
 * became pending (the caller is expected to resume from the returned VA).
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* The range must not span more than one twig's worth of VA space. */
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig-level entry: nothing is mapped here, so the range is done. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield (after at least one page) if preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only managed (pmap-tracked) pages carry attribute state. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7361 
/*
 * Clear attribute bits for all managed pages mapped in [start, end) of
 * `pmap`, one twig at a time, batching TLB invalidation into a single
 * ranged flush at the end.  Returns the VA at which processing stopped;
 * a return value < end means the lock could not be taken preemptibly or
 * preemption became pending, and the caller should retry from there.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* Bail out (returning `start`) rather than blocking with preemption pending. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to the caller's end. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig was cut short or preemption is pending. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the single coalesced TLB flush for everything changed above. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7414 
/*
 * Preemptible wrapper: repeatedly invoke the (PPL or in-kernel) range-clear
 * primitive until the whole [start, end) range has been processed, resuming
 * from wherever the previous call stopped.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7445 #endif /* __ARM_RANGE_TLBI__ */
7446 
/*
 * Single-page attribute clear: dispatch to the PPL trampoline when the
 * page-table monitor is enabled, otherwise call the internal routine directly.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7468 
7469 /*
7470  *	Set specified attribute bits.
7471  *
7472  *	Set cached value in the pv head because we have
7473  *	no per-mapping hardware support for referenced and
7474  *	modify bits.
7475  */
7476 MARK_AS_PMAP_TEXT void
7477 phys_attribute_set_internal(
7478 	ppnum_t pn,
7479 	unsigned int bits)
7480 {
7481 	pmap_paddr_t    pa = ptoa(pn);
7482 	assert(pn != vm_page_fictitious_addr);
7483 
7484 #if XNU_MONITOR
7485 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7486 		panic("%s: illegal request, "
7487 		    "pn=%u, bits=%#x",
7488 		    __FUNCTION__,
7489 		    pn, bits);
7490 	}
7491 #endif
7492 
7493 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7494 
7495 	return;
7496 }
7497 
/* Set attribute bits, routing through the PPL when the monitor is enabled. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7509 
7510 
7511 /*
7512  *	Check specified attribute bits.
7513  *
7514  *	use the software cached bits (since no hw support).
7515  */
static boolean_t
phys_attribute_test(
	ppnum_t pn,
	unsigned int bits)
{
	pmap_paddr_t    pa = ptoa(pn);
	assert(pn != vm_page_fictitious_addr);
	/* Read the software-cached attribute bits for this physical page. */
	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
}
7525 
7526 
7527 /*
7528  *	Set the modify/reference bits on the specified physical page.
7529  */
void
pmap_set_modify(ppnum_t pn)
{
	/* Mark the page as modified in the software attribute cache. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7535 
7536 
7537 /*
7538  *	Clear the modify bits on the specified physical page.
7539  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clear the modified bit; no options, no flush-completion callback. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7546 
7547 
7548 /*
7549  *	pmap_is_modified:
7550  *
7551  *	Return whether or not the specified physical page is modified
7552  *	by any physical maps.
7553  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Software-managed bit; no PTE scan is needed to answer this. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7560 
7561 
7562 /*
7563  *	Set the reference bit on the specified physical page.
7564  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Mark the page as referenced in the software attribute cache. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7571 
7572 /*
7573  *	Clear the reference bits on the specified physical page.
7574  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clear the referenced bit; no options, no flush-completion callback. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7581 
7582 
7583 /*
7584  *	pmap_is_referenced:
7585  *
7586  *	Return whether or not the specified physical page is referenced
7587  *	by any physical maps.
7588  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Software-managed bit; no PTE scan is needed to answer this. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7595 
7596 /*
7597  * pmap_get_refmod(phys)
7598  *  returns the referenced and modified bits of the specified
7599  *  physical page.
7600  */
7601 unsigned int
7602 pmap_get_refmod(
7603 	ppnum_t pn)
7604 {
7605 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7606 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7607 }
7608 
7609 static inline unsigned int
7610 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7611 {
7612 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7613 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7614 }
7615 
7616 /*
7617  * pmap_clear_refmod(phys, mask)
7618  *  clears the referenced and modified bits as specified by the mask
7619  *  of the specified physical page.
7620  */
7621 void
7622 pmap_clear_refmod_options(
7623 	ppnum_t         pn,
7624 	unsigned int    mask,
7625 	unsigned int    options,
7626 	void            *arg)
7627 {
7628 	unsigned int    bits;
7629 
7630 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7631 	phys_attribute_clear(pn, bits, options, arg);
7632 }
7633 
7634 /*
7635  * Perform pmap_clear_refmod_options on a virtual address range.
7636  * The operation will be performed in bulk & tlb flushes will be coalesced
7637  * if possible.
7638  *
7639  * Returns true if the operation is supported on this platform.
7640  * If this function returns false, the operation is not supported and
7641  * nothing has been modified in the pmap.
7642  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	/* Translate VM_MEM_* mask bits to PP_ATTR_* and clear over the range. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7670 
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper: no options, no flush-completion callback. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7678 
7679 unsigned int
7680 pmap_disconnect_options(
7681 	ppnum_t pn,
7682 	unsigned int options,
7683 	void *arg)
7684 {
7685 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7686 		/*
7687 		 * On ARM, the "modified" bit is managed by software, so
7688 		 * we know up-front if the physical page is "modified",
7689 		 * without having to scan all the PTEs pointing to it.
7690 		 * The caller should have made the VM page "busy" so noone
7691 		 * should be able to establish any new mapping and "modify"
7692 		 * the page behind us.
7693 		 */
7694 		if (pmap_is_modified(pn)) {
7695 			/*
7696 			 * The page has been modified and will be sent to
7697 			 * the VM compressor.
7698 			 */
7699 			options |= PMAP_OPTIONS_COMPRESSOR;
7700 		} else {
7701 			/*
7702 			 * The page hasn't been modified and will be freed
7703 			 * instead of compressed.
7704 			 */
7705 		}
7706 	}
7707 
7708 	/* disconnect the page */
7709 	pmap_page_protect_options(pn, 0, options, arg);
7710 
7711 	/* return ref/chg status */
7712 	return pmap_get_refmod(pn);
7713 }
7714 
7715 /*
7716  *	Routine:
7717  *		pmap_disconnect
7718  *
7719  *	Function:
7720  *		Disconnect all mappings for this page and return reference and change status
7721  *		in generic format.
7722  *
7723  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7731 
7732 boolean_t
7733 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7734 {
7735 	if (ptoa(first) >= vm_last_phys) {
7736 		return FALSE;
7737 	}
7738 	if (ptoa(last) < vm_first_phys) {
7739 		return FALSE;
7740 	}
7741 
7742 	return TRUE;
7743 }
7744 
7745 /*
7746  * The state maintained by the noencrypt functions is used as a
7747  * debugging aid on ARM.  This incurs some overhead on the part
7748  * of the caller.  A special case check in phys_attribute_clear
7749  * (the most expensive path) currently minimizes this overhead,
7750  * but stubbing these functions out on RELEASE kernels yields
7751  * further wins.
7752  */
7753 boolean_t
7754 pmap_is_noencrypt(
7755 	ppnum_t pn)
7756 {
7757 #if DEVELOPMENT || DEBUG
7758 	boolean_t result = FALSE;
7759 
7760 	if (!pa_valid(ptoa(pn))) {
7761 		return FALSE;
7762 	}
7763 
7764 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7765 
7766 	return result;
7767 #else
7768 #pragma unused(pn)
7769 	return FALSE;
7770 #endif
7771 }
7772 
/* Set the NOENCRYPT debugging attribute; a no-op on RELEASE kernels. */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Non-managed pages carry no attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7787 
/* Clear the NOENCRYPT debugging attribute; a no-op on RELEASE kernels. */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Non-managed pages carry no attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7802 
#if XNU_MONITOR
/* Return whether the (managed) page is owned by the page protection layer. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7811 
/*
 * Lock the per-page PV head lock for a managed page; for non-managed pages
 * (and always under XNU_MONITOR, where the PV locks live in the PPL) fall
 * back to the global phys_backup_lock.  Note the `else`/#else interleaving:
 * the final braced statement is the else-arm on !XNU_MONITOR builds and the
 * unconditional body on XNU_MONITOR builds.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7828 
7829 
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV head
 * lock for a managed page, otherwise the global phys_backup_lock.  Uses the
 * same `else`/#else interleaving as the lock routine.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7846 
/*
 * Program the user translation-table base for `pmap` on the current CPU:
 * cache the nested (shared-region) pmap state in per-CPU data, update TCR
 * if the page-size configuration differs (mixed-page-size builds), then
 * load TTBR with the pmap's table root and ASID.  For the kernel pmap,
 * the user TTB is instead pointed at the invalid table.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested-region info so fault handlers need not chase pmap pointers. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the target pmap's configuration differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7875 
/* Point the user TTB at the invalid translation table (no user mappings). */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7881 
/* Traced wrapper that clears the user TTB via the PPL when enabled. */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7893 
7894 
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Guard against the marker bit colliding with other SW-managed PTE bits. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7908 
7909 
/*
 * Walk the PV list for `ppnum` and downgrade every mapping's PTE so that the
 * next access not permitted by `allow_mode` takes arm_fast_fault(), allowing
 * ref/mod state to be regathered.  Runs in two passes: pass 1 rewrites PTEs
 * (tagging with ARM_PTE_FF_MARKER those needing invalidation), pass 2 clears
 * the markers and issues the TLB invalidations.  Returns TRUE if all
 * mappings were updated; FALSE if a wired mapping was skipped.  When
 * `flush_range` is non-NULL, invalidation of VAs it covers is deferred to
 * the caller via ptfr_flush_needed.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	/* Pass 1 and pass 2 must agree on how many PTEs were marked/unmarked. */
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	/* FF_LOCKED means the caller already holds the PV head lock. */
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	/* Remember the list head so pass 2 can re-walk from the beginning. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left untouched unless the caller opted in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/* Only HW-visible permission changes (not the SW WRITEABLE bit) need a flush. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* VAs outside the caller's flush range must be invalidated here in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Make all pass-1 PTE stores visible before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record which fast-fault flavors are now armed for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8223 
8224 MARK_AS_PMAP_TEXT boolean_t
8225 arm_force_fast_fault_internal(
8226 	ppnum_t         ppnum,
8227 	vm_prot_t       allow_mode,
8228 	int             options)
8229 {
8230 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8231 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8232 	}
8233 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8234 }
8235 
8236 /*
8237  *	Routine:	arm_force_fast_fault
8238  *
8239  *	Function:
8240  *		Force all mappings for this page to fault according
8241  *		to the access modes allowed, so we can gather ref/modify
8242  *		bits again.
8243  */
8244 
8245 boolean_t
8246 arm_force_fast_fault(
8247 	ppnum_t         ppnum,
8248 	vm_prot_t       allow_mode,
8249 	int             options,
8250 	__unused void   *arg)
8251 {
8252 	pmap_paddr_t    phys = ptoa(ppnum);
8253 
8254 	assert(ppnum != vm_page_fictitious_addr);
8255 
8256 	if (!pa_valid(phys)) {
8257 		return FALSE;   /* Not a managed page. */
8258 	}
8259 
8260 #if XNU_MONITOR
8261 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8262 #else
8263 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8264 #endif
8265 }
8266 
8267 /*
8268  *	Routine:	arm_clear_fast_fault
8269  *
8270  *	Function:
8271  *		Clear pending force fault for all mappings for this page based on
8272  *		the observed fault type, update ref/modify bits.
8273  */
8274 MARK_AS_PMAP_TEXT static boolean_t
8275 arm_clear_fast_fault(
8276 	ppnum_t ppnum,
8277 	vm_prot_t fault_type,
8278 	pt_entry_t *pte_p)
8279 {
8280 	pmap_paddr_t    pa = ptoa(ppnum);
8281 	pv_entry_t     *pve_p;
8282 	unsigned int    pai;
8283 	boolean_t       result;
8284 	bool            tlb_flush_needed = false;
8285 	pv_entry_t    **pv_h;
8286 	unsigned int    npve = 0;
8287 	unsigned int    pass1_updated = 0;
8288 	unsigned int    pass2_updated = 0;
8289 
8290 	assert(ppnum != vm_page_fictitious_addr);
8291 
8292 	if (!pa_valid(pa)) {
8293 		return FALSE;   /* Not a managed page. */
8294 	}
8295 
8296 	result = FALSE;
8297 	pai = pa_index(pa);
8298 	pvh_assert_locked(pai);
8299 	pv_h = pai_to_pvh(pai);
8300 
8301 	pve_p = PV_ENTRY_NULL;
8302 	if (pte_p == PT_ENTRY_NULL) {
8303 		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8304 			pte_p = pvh_ptep(pv_h);
8305 		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8306 			pve_p = pvh_pve_list(pv_h);
8307 		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8308 			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
8309 		}
8310 	}
8311 
8312 	pv_entry_t *orig_pve_p = pve_p;
8313 	pt_entry_t *orig_pte_p = pte_p;
8314 	int pve_ptep_idx = 0;
8315 
8316 	/*
8317 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8318 	 * TLB invalidation in pass 2.
8319 	 */
8320 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8321 		pt_entry_t spte;
8322 		pt_entry_t tmplate;
8323 
8324 		if (pve_p != PV_ENTRY_NULL) {
8325 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8326 			if (pte_p == PT_ENTRY_NULL) {
8327 				goto cff_skip_pve_pass1;
8328 			}
8329 		}
8330 
8331 #ifdef PVH_FLAG_IOMMU
8332 		if (pvh_ptep_is_iommu(pte_p)) {
8333 			goto cff_skip_pve_pass1;
8334 		}
8335 #endif
8336 		if (*pte_p == ARM_PTE_EMPTY) {
8337 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8338 		}
8339 
8340 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8341 		const pmap_t pmap = ptdp->pmap;
8342 		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8343 
8344 		assert(va >= pmap->min && va < pmap->max);
8345 
8346 		spte = *pte_p;
8347 		tmplate = spte;
8348 
8349 		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8350 			{
8351 				if (pmap == kernel_pmap) {
8352 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8353 				} else {
8354 					assert(pmap->type != PMAP_TYPE_NESTED);
8355 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8356 				}
8357 			}
8358 
8359 			tmplate |= ARM_PTE_AF;
8360 
8361 			pte_set_was_writeable(tmplate, false);
8362 			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8363 		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8364 			tmplate = spte | ARM_PTE_AF;
8365 
8366 			{
8367 				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
8368 			}
8369 		}
8370 
8371 #if MACH_ASSERT && XNU_MONITOR
8372 		if (is_pte_xprr_protected(pmap, spte)) {
8373 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8374 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8375 				    "ppnum=0x%x, fault_type=0x%x",
8376 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8377 				    ppnum, fault_type);
8378 			}
8379 		}
8380 #endif /* MACH_ASSERT && XNU_MONITOR */
8381 
8382 		assert(spte != ARM_PTE_EMPTY);
8383 		if (spte != tmplate) {
8384 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
8385 #ifdef ARM_PTE_FF_MARKER
8386 				assert(!(spte & ARM_PTE_FF_MARKER));
8387 				tmplate |= ARM_PTE_FF_MARKER;
8388 				++pass1_updated;
8389 #endif
8390 				tlb_flush_needed = true;
8391 			}
8392 			write_pte_fast(pte_p, tmplate);
8393 			result = TRUE;
8394 		}
8395 
8396 cff_skip_pve_pass1:
8397 		pte_p = PT_ENTRY_NULL;
8398 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8399 			pve_ptep_idx = 0;
8400 			pve_p = pve_next(pve_p);
8401 			++npve;
8402 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8403 				break;
8404 			}
8405 		}
8406 	}
8407 
8408 	if (!tlb_flush_needed) {
8409 		goto cff_finish;
8410 	}
8411 
8412 	FLUSH_PTE_STRONG();
8413 
8414 	/* Pass 2: Issue any required TLB invalidations */
8415 	pve_p = orig_pve_p;
8416 	pte_p = orig_pte_p;
8417 	pve_ptep_idx = 0;
8418 	npve = 0;
8419 
8420 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8421 		if (pve_p != PV_ENTRY_NULL) {
8422 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8423 			if (pte_p == PT_ENTRY_NULL) {
8424 				goto cff_skip_pve_pass2;
8425 			}
8426 		}
8427 
8428 #ifdef PVH_FLAG_IOMMU
8429 		if (pvh_ptep_is_iommu(pte_p)) {
8430 			goto cff_skip_pve_pass2;
8431 		}
8432 #endif
8433 
8434 #ifdef ARM_PTE_FF_MARKER
8435 		pt_entry_t spte = *pte_p;
8436 
8437 		if (!(spte & ARM_PTE_FF_MARKER)) {
8438 			goto cff_skip_pve_pass2;
8439 		} else {
8440 			spte &= (~ARM_PTE_FF_MARKER);
8441 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8442 			write_pte_fast(pte_p, spte);
8443 			++pass2_updated;
8444 		}
8445 #endif
8446 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8447 		const pmap_t pmap = ptdp->pmap;
8448 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8449 
8450 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
8451 		    pmap, true, false);
8452 
8453 cff_skip_pve_pass2:
8454 		pte_p = PT_ENTRY_NULL;
8455 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8456 			pve_ptep_idx = 0;
8457 			pve_p = pve_next(pve_p);
8458 			++npve;
8459 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8460 				break;
8461 			}
8462 		}
8463 	}
8464 
8465 cff_finish:
8466 	if (__improbable(pass1_updated != pass2_updated)) {
8467 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8468 		    __func__, pass1_updated, pass2_updated);
8469 	}
8470 	if (tlb_flush_needed) {
8471 		sync_tlb_flush();
8472 	}
8473 	return result;
8474 }
8475 
8476 /*
8477  * Determine if the fault was induced by software tracking of
8478  * modify/reference bits.  If so, re-enable the mapping (and set
8479  * the appropriate bits).
8480  *
8481  * Returns KERN_SUCCESS if the fault was induced and was
8482  * successfully handled.
8483  *
8484  * Returns KERN_FAILURE if the fault was not induced and
8485  * the function was unable to deal with it.
8486  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8488  * disallows this type of access.
8489  *
8490  * Returns KERN_ABORTED if the pmap lock is taken and a
8491  * preemption is pending.
8492  *
8493  */
/*
 * Core fast-fault handler: decide whether the fault on (pmap, va) was induced
 * by software ref/mod tracking and, if so, repair the page's mappings.
 * Runs with the pmap lock held shared; takes and releases the PVH lock for
 * the target page.  See the contract described above arm_fast_fault().
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_EMPTY;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Bail out (to be retried by the caller) if preemption is pending. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	/* Truncate the VA to the pmap's page boundary before the PTE lookup. */
	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we have a stable PTE snapshot taken under the PVH
		 * lock; a concurrent update between the read and the lock
		 * acquisition forces a re-read.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_EMPTY) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* PPL-owned pages must never be writable outside the PPL. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8633 
8634 kern_return_t
8635 arm_fast_fault(
8636 	pmap_t pmap,
8637 	vm_map_address_t va,
8638 	vm_prot_t fault_type,
8639 	bool was_af_fault,
8640 	__unused bool from_user)
8641 {
8642 	kern_return_t   result = KERN_FAILURE;
8643 
8644 	if (va < pmap->min || va >= pmap->max) {
8645 		return result;
8646 	}
8647 
8648 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8649 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8650 	    from_user);
8651 
8652 	do {
8653 #if XNU_MONITOR
8654 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8655 #else
8656 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8657 #endif
8658 	} while (result == KERN_ABORTED);
8659 
8660 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8661 
8662 	return result;
8663 }
8664 
8665 void
8666 pmap_copy_page(
8667 	ppnum_t psrc,
8668 	ppnum_t pdst,
8669 	int options)
8670 {
8671 	bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8672 	    (addr64_t) (ptoa(pdst)),
8673 	    PAGE_SIZE,
8674 	    options);
8675 }
8676 
8677 
8678 /*
8679  *	pmap_copy_page copies the specified (machine independent) pages.
8680  */
8681 void
8682 pmap_copy_part_page(
8683 	ppnum_t psrc,
8684 	vm_offset_t src_offset,
8685 	ppnum_t pdst,
8686 	vm_offset_t dst_offset,
8687 	vm_size_t len)
8688 {
8689 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8690 	    (addr64_t) (ptoa(pdst) + dst_offset),
8691 	    len);
8692 }
8693 
8694 
8695 /*
8696  *	pmap_zero_page zeros the specified (machine independent) page.
8697  */
8698 void
8699 pmap_zero_page(
8700 	ppnum_t pn)
8701 {
8702 	assert(pn != vm_page_fictitious_addr);
8703 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8704 }
8705 
8706 void
8707 pmap_zero_page_with_options(
8708 	ppnum_t pn,
8709 	int options)
8710 {
8711 	assert(pn != vm_page_fictitious_addr);
8712 	bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8713 }
8714 
8715 /*
8716  *	pmap_zero_part_page
8717  *	zeros the specified (machine independent) part of a page.
8718  */
8719 void
8720 pmap_zero_part_page(
8721 	ppnum_t pn,
8722 	vm_offset_t offset,
8723 	vm_size_t len)
8724 {
8725 	assert(pn != vm_page_fictitious_addr);
8726 	assert(offset + len <= PAGE_SIZE);
8727 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8728 }
8729 
/*
 * Establish the kernel's low-globals alias mapping.
 *
 * Maps the LOWGLOBAL_ALIAS virtual page read-only (kernel-only,
 * never-execute) onto the physical page backing the 'lowGlo' structure.
 * Expects the PTE slot for LOWGLOBAL_ALIAS to already exist and be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Read-only, non-executable (both EL0 and EL1) mapping of lowGlo. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	FLUSH_PTE();
	/* Make the new translation visible before any use of the alias. */
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8754 
8755 vm_offset_t
8756 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8757 {
8758 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8759 		panic("%s: invalid index %u", __func__, index);
8760 	}
8761 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8762 }
8763 
/*
 * Map a physical page into one of the current CPU's dedicated copy windows.
 *
 * @param pn        physical page to map
 * @param prot      VM_PROT_WRITE selects a writable window, otherwise read-only
 * @param wimg_bits cacheability/memory-type attributes for the mapping
 *
 * @return index of the window used; panics if every window is in use.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/*
	 * Under the PPL, copy windows may not map managed pages, and
	 * PPL-protected I/O may only be mapped read-only (unless the PPL
	 * has been explicitly disabled).
	 */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (invalid PTE) window belonging to this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (!pte_is_valid(*ptep)) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, never-execute PTE for the chosen window. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	/* Record whether the *next* unmap of this window needs strong sync. */
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8838 
8839 unsigned int
8840 pmap_map_cpu_windows_copy(
8841 	ppnum_t pn,
8842 	vm_prot_t prot,
8843 	unsigned int wimg_bits)
8844 {
8845 #if XNU_MONITOR
8846 	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8847 #else
8848 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8849 #endif
8850 }
8851 
8852 MARK_AS_PMAP_TEXT void
8853 pmap_unmap_cpu_windows_copy_internal(
8854 	unsigned int index)
8855 {
8856 	pt_entry_t      *ptep;
8857 	unsigned int    cpu_num;
8858 	vm_offset_t     cpu_copywindow_vaddr = 0;
8859 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8860 
8861 	cpu_num = pmap_cpu_data->cpu_number;
8862 
8863 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8864 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
8865 	 * (which are likely to have been on I/O memory) are complete before
8866 	 * tearing down the mapping. */
8867 	__builtin_arm_dsb(DSB_SY);
8868 	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8869 	write_pte_strong(ptep, ARM_PTE_EMPTY);
8870 	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8871 }
8872 
/*
 * Public entry point for tearing down a per-CPU copy window; dispatches
 * to the PPL when one is present.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8883 
8884 #if XNU_MONITOR
8885 
8886 MARK_AS_PMAP_TEXT void
8887 pmap_invoke_with_page(
8888 	ppnum_t page_number,
8889 	void *ctx,
8890 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8891 {
8892 	#pragma unused(page_number, ctx, callback)
8893 }
8894 
8895 /*
8896  * Loop over every pmap_io_range (I/O ranges marked as owned by
8897  * the PPL in the device tree) and conditionally call callback() on each range
8898  * that needs to be included in the hibernation image.
8899  *
8900  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8901  *                 context is needed in the callback.
8902  * @param callback Callback function invoked on each range (gated by flag).
8903  */
8904 MARK_AS_PMAP_TEXT void
8905 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8906 {
8907 	extern const pmap_io_range_t* io_attr_table;
8908 	extern const unsigned int num_io_rgns;
8909 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8910 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8911 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8912 		}
8913 	}
8914 }
8915 
8916 /**
8917  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8918  * PPL-owned page. Otherwise, do nothing.
8919  *
8920  * @param addr Physical address of the page to set the HASHED flag on.
8921  */
8922 MARK_AS_PMAP_TEXT void
8923 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8924 {
8925 	/* Ignore non-managed kernel memory. */
8926 	if (!pa_valid(addr)) {
8927 		return;
8928 	}
8929 
8930 	const unsigned int pai = pa_index(addr);
8931 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8932 		pv_entry_t **pv_h = pai_to_pvh(pai);
8933 
8934 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8935 		pvh_lock(pai);
8936 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8937 		pvh_unlock(pai);
8938 	}
8939 }
8940 
8941 /**
8942  * Loop through every physical page in the system and clear out the HASHED flag
8943  * on every PPL-owned page. That flag is used to keep track of which pages have
8944  * been hashed into the hibernation image during the hibernation entry process.
8945  *
8946  * The HASHED flag needs to be cleared out between hibernation cycles because the
8947  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8948  * image with the HASHED flag set on certain pages. It's important to clear the
8949  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8950  * into the hibernation image can't be compromised across hibernation cycles.
8951  */
8952 MARK_AS_PMAP_TEXT void
8953 pmap_clear_ppl_hashed_flag_all(void)
8954 {
8955 	const unsigned int last_index = pa_index(vm_last_phys);
8956 	pv_entry_t **pv_h = NULL;
8957 
8958 	for (int pai = 0; pai < last_index; ++pai) {
8959 		pv_h = pai_to_pvh(pai);
8960 
8961 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8962 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8963 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8964 			pvh_lock(pai);
8965 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8966 			pvh_unlock(pai);
8967 		}
8968 	}
8969 }
8970 
8971 /**
8972  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8973  * ppl_hib driver will call this after all wired pages have been copied into the
8974  * hibernation image.
8975  */
8976 MARK_AS_PMAP_TEXT void
8977 pmap_check_ppl_hashed_flag_all(void)
8978 {
8979 	const unsigned int last_index = pa_index(vm_last_phys);
8980 	pv_entry_t **pv_h = NULL;
8981 
8982 	for (int pai = 0; pai < last_index; ++pai) {
8983 		pv_h = pai_to_pvh(pai);
8984 
8985 		/**
8986 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8987 		 * the pages that contain the PMAP stacks.
8988 		 */
8989 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8990 		    (pai < pa_index(pmap_stacks_end_pa));
8991 
8992 		if (!is_pmap_stack &&
8993 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8994 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8995 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8996 		}
8997 	}
8998 }
8999 
9000 #endif /* XNU_MONITOR */
9001 
9002 /*
9003  * Indicate that a pmap is intended to be used as a nested pmap
9004  * within one or more larger address spaces.  This must be set
9005  * before pmap_nest() is called with this pmap as the 'subordinate'.
9006  */
/*
 * Mark 'pmap' as a nested (shared) pmap.  Only a PMAP_TYPE_USER pmap that
 * has no nested pmap of its own may be transitioned; anything else panics.
 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/*
	 * Atomically transition the pmap's type from USER to NESTED; any other
	 * starting type (including already-NESTED) is a fatal error.
	 */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Release the pmap's address-space ID via the pt_ops callback. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9053 
/*
 * External entry point: marks 'pmap' as nestable, dispatching through the
 * PPL when one is present.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
9064 
9065 bool
9066 pmap_is_nested(
9067 	pmap_t pmap)
9068 {
9069 	return pmap->type == PMAP_TYPE_NESTED;
9070 }
9071 
9072 /*
9073  * pmap_trim_range(pmap, start, end)
9074  *
9075  * pmap  = pmap to operate on
9076  * start = start of the range
9077  * end   = end of the range
9078  *
9079  * Attempts to deallocate TTEs for the given range in the nested range.
9080  */
/*
 * Deallocate TTEs covering [start, end) within the pmap's nested region.
 * 'pmap' may be either the nested pmap itself (tables are deallocated) or a
 * user pmap that nests it (tables are merely unlinked).  Both bounds must
 * fall inside the pmap's nested region.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan every entry of the L2 table this L1 entry points to. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* The L2 table is entirely empty; reclaim it. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9196 
9197 /**
9198  * State machine for multi-step pmap trimming. Trimming is the action of
9199  * deallocating the TTEs of the shared region of pmaps down to a given range.
9200  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9201  * disabling preemption for too long. These steps include computing the bounds
9202  * of the shared region, trimming the head of the "grand", trimming the tail of
9203  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9204  * different conditions.
9205  *
9206  * @param grand the pmap in which the pages are nested
9207  * @param subord the pmap from which the pages are shared, or nested
9208  * @param vstart start of the used range in "grand"
9209  * @param size size of the used range
9210  * @param state the current state of the state machine
9211  *
9212  * @return the next state of the state machine, to be used in the next call
9213  *         into this function.
9214  */
9215 MARK_AS_PMAP_TEXT pmap_trim_state_t
9216 pmap_trim_internal(
9217 	pmap_t grand,
9218 	pmap_t subord,
9219 	addr64_t vstart,
9220 	uint64_t size,
9221 	pmap_trim_state_t state)
9222 {
9223 	/* Validation needs to be done regardless of state. */
9224 	addr64_t vend;
9225 
9226 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9227 		panic("%s: grand addr wraps around, "
9228 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9229 		    __func__, grand, subord, (void*)vstart, size, state);
9230 	}
9231 
9232 	validate_pmap_mutable(grand);
9233 	validate_pmap(subord);
9234 
9235 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9236 		panic("%s: subord is of non-nestable type 0x%hhx, "
9237 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9238 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9239 	}
9240 
9241 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9242 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9243 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9244 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9245 	}
9246 
9247 	if (__improbable(grand->nested_pmap != subord)) {
9248 		panic("%s: grand->nested != subord, "
9249 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9250 		    __func__, grand, subord, (void*)vstart, size, state);
9251 	}
9252 
9253 	if (__improbable((size != 0) &&
9254 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9255 		panic("%s: grand range not in nested region, "
9256 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9257 		    __func__, grand, subord, (void*)vstart, size, state);
9258 	}
9259 
9260 
9261 	/* Trimming starts with figuring out the bounds for the grand. */
9262 	if (state == PMAP_TRIM_STATE_START) {
9263 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9264 
9265 		/**
9266 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9267 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9268 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9269 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9270 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9271 		 * PMAP_TRIM_STATE_DONE.
9272 		 */
9273 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9274 			assert(subord->nested_bounds_set);
9275 
9276 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9277 			if (!grand->nested_bounds_set) {
9278 				/* Inherit the bounds from subord. */
9279 				grand->nested_region_true_start = subord->nested_region_true_start;
9280 				grand->nested_region_true_end = subord->nested_region_true_end;
9281 				grand->nested_bounds_set = true;
9282 			}
9283 
9284 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9285 
9286 			/* Now that the grand has bounds, we are done. */
9287 			return PMAP_TRIM_STATE_DONE;
9288 		}
9289 
9290 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9291 		if ((!subord->nested_bounds_set) && size) {
9292 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9293 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9294 
9295 			subord->nested_region_true_start = vstart;
9296 			subord->nested_region_true_end = vend;
9297 			subord->nested_region_true_start &= ~adjust_offmask;
9298 
9299 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9300 				panic("%s: padded true end wraps around, "
9301 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9302 				    __func__, grand, subord, (void*)vstart, size, state);
9303 			}
9304 
9305 			subord->nested_region_true_end &= ~adjust_offmask;
9306 			subord->nested_bounds_set = true;
9307 		}
9308 
9309 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9310 		if (subord->nested_bounds_set) {
9311 			/* Inherit the bounds from subord. */
9312 			grand->nested_region_true_start = subord->nested_region_true_start;
9313 			grand->nested_region_true_end = subord->nested_region_true_end;
9314 			grand->nested_bounds_set = true;
9315 
9316 			/* If we know the bounds, we can trim the pmap. */
9317 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9318 
9319 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9320 		} else {
9321 			/* Don't trim if we don't know the bounds. */
9322 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9323 
9324 			return PMAP_TRIM_STATE_DONE;
9325 		}
9326 	}
9327 
9328 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9329 	if (!grand->nested_bounds_set) {
9330 		panic("%s: !grand->nested_bounds_set, "
9331 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9332 		    __func__, grand, subord, (void*)vstart, size, state);
9333 	}
9334 
9335 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9336 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9337 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9338 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9339 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9340 			    (unsigned int)grand->nested_no_bounds_ref_state);
9341 		}
9342 
9343 #if XNU_MONITOR
9344 		if (pmap_pending_preemption()) {
9345 			return PMAP_TRIM_STATE_GRAND_AFTER;
9346 		}
9347 #endif
9348 
9349 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9350 	}
9351 
9352 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9353 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9354 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9355 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9356 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9357 			    (unsigned int)grand->nested_no_bounds_ref_state);
9358 		}
9359 
9360 #if XNU_MONITOR
9361 		if (pmap_pending_preemption()) {
9362 			return PMAP_TRIM_STATE_SUBORD;
9363 		}
9364 #endif
9365 
9366 		state = PMAP_TRIM_STATE_SUBORD;
9367 	}
9368 
9369 	/* START state is guaranteed to compute the bounds for the subord. */
9370 	if (!subord->nested_bounds_set) {
9371 		panic("%s: !subord->nested_bounds_set, "
9372 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9373 		    __func__, grand, subord, (void*)vstart, size, state);
9374 	}
9375 
9376 	if (state == PMAP_TRIM_STATE_SUBORD) {
9377 		/**
9378 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9379 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9380 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9381 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9382 		 * the state update is visible only once the preceding trim operation is complete.  An
9383 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9384 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9385 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9386 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9387 		 * of the state CAS.
9388 		 */
9389 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9390 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9391 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9392 			    (unsigned int)grand->nested_no_bounds_ref_state);
9393 		}
9394 		pmap_trim_subord(subord);
9395 	}
9396 
9397 	return PMAP_TRIM_STATE_DONE;
9398 }
9399 
9400 MARK_AS_PMAP_TEXT static void
9401 pmap_trim_self(pmap_t pmap)
9402 {
9403 	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9404 		/* If we have a no bounds ref, we need to drop it. */
9405 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9406 		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9407 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9408 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9409 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9410 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9411 
9412 		if (nested_bounds_set) {
9413 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9414 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9415 		}
9416 		/*
9417 		 * Try trimming the nested pmap, in case we had the
9418 		 * last reference.
9419 		 */
9420 		pmap_trim_subord(pmap->nested_pmap);
9421 	}
9422 }
9423 
/*
 * pmap_trim_subord(subord)
 *
 * subord = nested pmap we are attempting to trim
 *
 * Drops one no-bounds reference on subord, and trims subord down to its
 * true bounds if that was the last such reference.
 */
9432 MARK_AS_PMAP_TEXT static void
9433 pmap_trim_subord(pmap_t subord)
9434 {
9435 	bool contract_subord = false;
9436 
9437 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9438 
9439 	subord->nested_no_bounds_refcnt--;
9440 
9441 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9442 		/* If this was the last no bounds reference, trim subord. */
9443 		contract_subord = true;
9444 	}
9445 
9446 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9447 
9448 	if (contract_subord) {
9449 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9450 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9451 	}
9452 }
9453 
9454 /**
9455  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9456  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9457  * disabling preemption for too long.
9458  *
 * @note When we load the shared region we always create page tables for the
9460  *       entire region. In practice, the shared cache may use just a portion
9461  *       of that. Before we know the bounds of the shared region, it can
9462  *       already be mapped into processes. Therefore, once the bounds are
9463  *       known, "trimming" comes in handy to remove the unnecessary page
9464  *       tables in the processes the shared region is mapped in, and eventually
9465  *       those in the shared region itself. Note that the shared region must
9466  *       be trimmed after the user processes because it has the L3 entries
9467  *       everyone else is pointing to.
9468  *
9469  * @param grand the pmap in which the pages are nested
9470  * @param subord the pmap from which the pages are shared, or nested
9471  * @param vstart start of the used range in "grand"
9472  * @param size size of the used range
9473  */
9474 void
9475 pmap_trim(
9476 	pmap_t grand,
9477 	pmap_t subord,
9478 	addr64_t vstart,
9479 	uint64_t size)
9480 {
9481 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9482 
9483 #if XNU_MONITOR
9484 	/* On PPL systems, drives the state machine until its done. */
9485 	while (state != PMAP_TRIM_STATE_DONE) {
9486 		__assert_only pmap_trim_state_t old_state = state;
9487 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9488 
9489 		/* Are we making progress? */
9490 		assert(old_state != state);
9491 	}
9492 
9493 	pmap_ledger_check_balance(grand);
9494 	pmap_ledger_check_balance(subord);
9495 #else
9496 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9497 
9498 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9499 	assert(state == PMAP_TRIM_STATE_DONE);
9500 #endif
9501 }
9502 
9503 #if HAS_APPLE_PAC
/**
 * Sign a user-space pointer with one of the process-independent ptrauth keys,
 * using the supplied user JOP key for the signing operation.
 *
 * @param value the raw pointer value to sign
 * @param key the ptrauth key to sign with; must be ptrauth_key_asia or
 *        ptrauth_key_asda (process-independent keys), otherwise panics
 * @param discriminator the ptrauth discriminator to mix into the signature
 * @param jop_key the user JOP key to install for the duration of the signing
 *
 * @return the signed pointer
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Interrupts are disabled across the window in which the user JOP key is
	 * installed — presumably so nothing can run with the temporary key state;
	 * NOTE(review): confirm exact rationale against ml_enable_user_jop_key().
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/*
	 * The compiler barriers pin the sign operation between the JOP key
	 * enable/disable calls so it cannot be hoisted or sunk out of the window.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9535 
9536 void *
9537 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9538 {
9539 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9540 }
9541 
/**
 * Authenticate (strip and verify the signature of) a user-space pointer signed
 * with one of the process-independent ptrauth keys, using the supplied user
 * JOP key for the auth operation.
 *
 * @param value the signed pointer to authenticate
 * @param key the ptrauth key to authenticate with; must be ptrauth_key_asia or
 *        ptrauth_key_asda (process-independent keys), otherwise panics
 * @param discriminator the ptrauth discriminator used when the pointer was signed
 * @param jop_key the user JOP key to install for the duration of the operation
 *
 * @return the authenticated pointer value
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Interrupts are disabled across the window in which the user JOP key is
	 * installed; the compiler barriers pin the auth operation inside that
	 * window so it cannot be reordered past the key enable/disable calls.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9562 
9563 void *
9564 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9565 {
9566 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9567 }
9568 #endif /* HAS_APPLE_PAC */
9569 
9570 /*
9571  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9572  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9573  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9574  * return value, to indicate where a preempted [un]nest operation should resume.
9575  * When the return value contains the ending address of the nested region with
9576  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9577  */
9578 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9579 
9580 /*
9581  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9582  *
9583  *	grand  = the pmap that we will nest subord into
9584  *	subord = the pmap that goes into the grand
9585  *	vstart  = start of range in pmap to be inserted
9586  *	size   = Size of nest area (up to 16TB)
9587  *
9588  *	Inserts a pmap into another.  This is used to implement shared segments.
9589  *
9590  */
9591 
9592 /**
9593  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9594  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9595  * This function operates in 3 main phases:
9596  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9597  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9598  *    the mapping range are present in subord.
9599  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9600  *    contains pointers to subord's leaf-level pagetable pages for the specified
9601  *    VA range.
9602  *
9603  * This function may return early due to pending AST_URGENT preemption; if so
9604  * it will indicate the need to be re-entered.
9605  *
9606  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9607  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9608  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9609  * @param size twig-aligned size of the nesting range
9610  * @param vrestart the twig-aligned starting address of the current call.  May contain
9611  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9612  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9613  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9614  *
9615  * @return the virtual address at which to restart the operation, possibly including
9616  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9617  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9618  */
9619 MARK_AS_PMAP_TEXT vm_map_offset_t
9620 pmap_nest_internal(
9621 	pmap_t grand,
9622 	pmap_t subord,
9623 	addr64_t vstart,
9624 	uint64_t size,
9625 	vm_map_offset_t vrestart,
9626 	kern_return_t *krp)
9627 {
9628 	kern_return_t kr = KERN_FAILURE;
9629 	vm_map_offset_t vaddr;
9630 	tt_entry_t     *stte_p;
9631 	tt_entry_t     *gtte_p;
9632 	uint64_t        nested_region_unnested_table_bitmap_size;
9633 	unsigned int*   nested_region_unnested_table_bitmap = NULL;
9634 	uint64_t        new_nested_region_unnested_table_bitmap_size;
9635 	unsigned int*   new_nested_region_unnested_table_bitmap = NULL;
9636 	int             expand_options = 0;
9637 	bool            deref_subord = true;
9638 	bool            grand_locked = false;
9639 
9640 	addr64_t vend;
9641 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9642 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9643 	}
9644 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9645 	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9646 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9647 		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9648 	}
9649 
9650 	assert(krp != NULL);
9651 	validate_pmap_mutable(grand);
9652 	validate_pmap(subord);
9653 #if XNU_MONITOR
9654 	/*
9655 	 * Ordering is important here.  validate_pmap() has already ensured subord is a
9656 	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9657 	 * be in the process of being destroyed.  If destruction is already committed,
9658 	 * then the check of ref_count below will cover us.  If destruction is initiated
9659 	 * during or after this call, then pmap_destroy() will catch the non-zero
9660 	 * nested_count.
9661 	 */
9662 	os_atomic_inc(&subord->nested_count, relaxed);
9663 	os_atomic_thread_fence(seq_cst);
9664 #endif
9665 	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
9666 		panic("%s: invalid subordinate pmap %p", __func__, subord);
9667 	}
9668 
9669 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9670 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9671 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9672 	}
9673 
9674 #if XNU_MONITOR
9675 	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9676 #endif
9677 
9678 	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9679 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9680 		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9681 		    grand, vstart, size, (unsigned long long)vrestart);
9682 	}
9683 
9684 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9685 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9686 	}
9687 
9688 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9689 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9690 	}
9691 
9692 	if (subord->nested_region_unnested_table_bitmap == NULL) {
9693 		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9694 
9695 		/**
9696 		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
9697 		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
9698 		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
9699 		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
9700 		 */
9701 		nested_region_unnested_table_bitmap_size <<= 1;
9702 
9703 		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9704 			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9705 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9706 			    __func__, nested_region_unnested_table_bitmap_size,
9707 			    grand, subord, vstart, size);
9708 		}
9709 
9710 #if XNU_MONITOR
9711 		pmap_paddr_t pa = 0;
9712 
9713 		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9714 			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9715 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9716 			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
9717 			    grand, subord, vstart, size);
9718 		}
9719 
9720 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9721 
9722 		if (kr != KERN_SUCCESS) {
9723 			goto nest_cleanup;
9724 		}
9725 
9726 		assert(pa);
9727 
9728 		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9729 #else
9730 		nested_region_unnested_table_bitmap = kalloc_data(
9731 			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9732 			Z_WAITOK | Z_ZERO);
9733 #endif
9734 
9735 		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9736 			kr = KERN_ABORTED;
9737 			goto nest_cleanup;
9738 		}
9739 
9740 		if (subord->nested_region_unnested_table_bitmap == NULL) {
9741 			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9742 			subord->nested_region_addr = vstart;
9743 			subord->nested_region_size = (mach_vm_offset_t) size;
9744 
9745 			/**
9746 			 * Ensure that the rest of the subord->nested_region_* fields are
9747 			 * initialized and visible before setting the nested_region_unnested_table_bitmap
9748 			 * field (which is used as the flag to say that the rest are initialized).
9749 			 */
9750 			__builtin_arm_dmb(DMB_ISHST);
9751 			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
9752 			nested_region_unnested_table_bitmap = NULL;
9753 		}
9754 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9755 		if (nested_region_unnested_table_bitmap != NULL) {
9756 #if XNU_MONITOR
9757 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9758 #else
9759 			kfree_data(nested_region_unnested_table_bitmap,
9760 			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9761 #endif
9762 			nested_region_unnested_table_bitmap = NULL;
9763 		}
9764 	}
9765 
9766 	/**
9767 	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9768 	 * speculated before their initialization.
9769 	 */
9770 	__builtin_arm_dmb(DMB_ISHLD);
9771 
9772 	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9773 		uint64_t        new_size;
9774 
9775 		nested_region_unnested_table_bitmap = NULL;
9776 		nested_region_unnested_table_bitmap_size = 0ULL;
9777 		new_size =  vend - subord->nested_region_addr;
9778 
9779 		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9780 		new_nested_region_unnested_table_bitmap_size <<= 1;
9781 
9782 		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9783 			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9784 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9785 			    __func__, new_nested_region_unnested_table_bitmap_size,
9786 			    grand, subord, vstart, size);
9787 		}
9788 
9789 #if XNU_MONITOR
9790 		pmap_paddr_t pa = 0;
9791 
9792 		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9793 			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9794 			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9795 			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
9796 			    grand, subord, vstart, new_size);
9797 		}
9798 
9799 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9800 
9801 		if (kr != KERN_SUCCESS) {
9802 			goto nest_cleanup;
9803 		}
9804 
9805 		assert(pa);
9806 
9807 		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9808 #else
9809 		new_nested_region_unnested_table_bitmap = kalloc_data(
9810 			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9811 			Z_WAITOK | Z_ZERO);
9812 #endif
9813 		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9814 			kr = KERN_ABORTED;
9815 			goto nest_cleanup;
9816 		}
9817 
9818 		if (subord->nested_region_size < new_size) {
9819 			bcopy(subord->nested_region_unnested_table_bitmap,
9820 			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9821 			nested_region_unnested_table_bitmap_size  = subord->nested_region_unnested_table_bitmap_size;
9822 			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
9823 			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
9824 			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
9825 			subord->nested_region_size = new_size;
9826 			new_nested_region_unnested_table_bitmap = NULL;
9827 		}
9828 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9829 		if (nested_region_unnested_table_bitmap != NULL) {
9830 #if XNU_MONITOR
9831 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9832 #else
9833 			kfree_data(nested_region_unnested_table_bitmap,
9834 			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9835 #endif
9836 			nested_region_unnested_table_bitmap = NULL;
9837 		}
9838 		if (new_nested_region_unnested_table_bitmap != NULL) {
9839 #if XNU_MONITOR
9840 			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9841 #else
9842 			kfree_data(new_nested_region_unnested_table_bitmap,
9843 			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9844 #endif
9845 			new_nested_region_unnested_table_bitmap = NULL;
9846 		}
9847 	}
9848 
9849 	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9850 		kr = KERN_ABORTED;
9851 		goto nest_cleanup;
9852 	}
9853 
9854 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
9855 		/**
9856 		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9857 		 * into a nested pmap, which would then produce multiple levels of nesting.
9858 		 */
9859 		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9860 			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9861 		}
9862 		/*
9863 		 * If this is grand's first nesting operation, keep the reference on subord.
9864 		 * It will be released by pmap_destroy_internal() when grand is destroyed.
9865 		 */
9866 		deref_subord = false;
9867 
9868 		if (!subord->nested_bounds_set) {
9869 			/*
9870 			 * We are nesting without the shared regions bounds
9871 			 * being known.  We'll have to trim the pmap later.
9872 			 */
9873 			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9874 			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9875 				panic("%s: grand %p already nested", __func__, grand);
9876 			}
9877 			subord->nested_no_bounds_refcnt++;
9878 		}
9879 
9880 		if (__improbable(vstart < subord->nested_region_addr ||
9881 		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
9882 			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
9883 			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
9884 			    (void *) (subord->nested_region_addr + subord->nested_region_size));
9885 		}
9886 
9887 		grand->nested_region_addr = vstart;
9888 		grand->nested_region_size = (mach_vm_offset_t) size;
9889 	} else {
9890 		if (__improbable(grand->nested_pmap != subord)) {
9891 			panic("pmap_nest() pmap %p has a nested pmap", grand);
9892 		} else if (__improbable(grand->nested_region_addr > vstart)) {
9893 			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9894 		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9895 			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9896 		}
9897 	}
9898 
9899 	vaddr = vrestart & ~PMAP_NEST_GRAND;
9900 	if (vaddr < subord->nested_region_true_start) {
9901 		vaddr = subord->nested_region_true_start;
9902 	}
9903 
9904 	addr64_t true_end = vend;
9905 	if (true_end > subord->nested_region_true_end) {
9906 		true_end = subord->nested_region_true_end;
9907 	}
9908 	__unused unsigned int ttecount = 0;
9909 
9910 	if (vrestart & PMAP_NEST_GRAND) {
9911 		goto nest_grand;
9912 	}
9913 
9914 	while (vaddr < true_end) {
9915 		stte_p = pmap_tte(subord, vaddr);
9916 		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9917 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9918 			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9919 
9920 			if (kr != KERN_SUCCESS) {
9921 				goto done;
9922 			}
9923 
9924 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9925 		}
9926 		vaddr += pt_attr_twig_size(pt_attr);
9927 		vrestart = vaddr;
9928 		++ttecount;
9929 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9930 		    pmap_pending_preemption())) {
9931 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9932 			kr = KERN_SUCCESS;
9933 			goto done;
9934 		}
9935 	}
9936 	/*
9937 	 * copy TTEs from subord pmap into grand pmap
9938 	 */
9939 
9940 	vaddr = (vm_map_offset_t) vstart;
9941 	if (vaddr < subord->nested_region_true_start) {
9942 		vaddr = subord->nested_region_true_start;
9943 	}
9944 	vrestart = vaddr | PMAP_NEST_GRAND;
9945 
9946 nest_grand:
9947 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9948 
9949 	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9950 		kr = KERN_ABORTED;
9951 		goto done;
9952 	}
9953 	while (vaddr < true_end) {
9954 		gtte_p = pmap_tte(grand, vaddr);
9955 		if (gtte_p == PT_ENTRY_NULL) {
9956 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9957 			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9958 			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9959 				if (kr == KERN_SUCCESS) {
9960 					kr = KERN_ABORTED;
9961 				}
9962 			}
9963 
9964 			if (kr != KERN_SUCCESS) {
9965 				goto done;
9966 			}
9967 
9968 			gtte_p = pmap_tt2e(grand, vaddr);
9969 		}
9970 		/* Don't leak a page table page.  Don't violate break-before-make. */
9971 		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9972 			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9973 			    __func__, gtte_p, grand);
9974 		}
9975 		/**
9976 		 * It's possible that grand was trimmed by pmap_trim_internal() while the
9977 		 * lock was dropped, in which case the previously stored "true" start/end
9978 		 * will no longer be accurate.  In that case, we need to avoid nesting
9979 		 * tables outside the trimmed range, as those tables may be immediately freed
9980 		 * which would lead to a dangling page table pointer in grand.
9981 		 * Note that pmap_trim() may concurrently update grand's bounds as we are
9982 		 * making these checks, but in that case pmap_trim_range() has not yet
9983 		 * been called on grand and will wait for us to drop grand's lock, so it
9984 		 * should see any TTEs we've nested here and clear them appropriately.
9985 		 */
9986 		if (__probable((vaddr >= grand->nested_region_true_start) &&
9987 		    (vaddr < grand->nested_region_true_end))) {
9988 			stte_p = pmap_tte(subord, vaddr);
9989 			if (__improbable(stte_p == PT_ENTRY_NULL)) {
9990 				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
9991 			}
9992 			*gtte_p = *stte_p;
9993 		}
9994 
9995 		vaddr += pt_attr_twig_size(pt_attr);
9996 		vrestart = vaddr | PMAP_NEST_GRAND;
9997 		++ttecount;
9998 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9999 		    pmap_pending_preemption())) {
10000 			break;
10001 		}
10002 	}
10003 	if (vaddr >= true_end) {
10004 		vrestart = vend | PMAP_NEST_GRAND;
10005 	}
10006 
10007 	kr = KERN_SUCCESS;
10008 done:
10009 
10010 	FLUSH_PTE();
10011 	__builtin_arm_isb(ISB_SY);
10012 
10013 	if (grand_locked) {
10014 		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10015 	}
10016 
10017 nest_cleanup:
10018 #if XNU_MONITOR
10019 	if (kr != KERN_SUCCESS) {
10020 		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
10021 		*krp = kr;
10022 		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
10023 	}
10024 #else
10025 	if (kr != KERN_SUCCESS) {
10026 		*krp = kr;
10027 	}
10028 #endif
10029 	if (nested_region_unnested_table_bitmap != NULL) {
10030 #if XNU_MONITOR
10031 		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
10032 #else
10033 		kfree_data(nested_region_unnested_table_bitmap,
10034 		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
10035 #endif
10036 	}
10037 	if (new_nested_region_unnested_table_bitmap != NULL) {
10038 #if XNU_MONITOR
10039 		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
10040 #else
10041 		kfree_data(new_nested_region_unnested_table_bitmap,
10042 		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
10043 #endif
10044 	}
10045 	if (deref_subord) {
10046 #if XNU_MONITOR
10047 		os_atomic_dec(&subord->nested_count, relaxed);
10048 #endif
10049 		pmap_destroy_internal(subord);
10050 	}
10051 	return vrestart;
10052 }
10053 
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	/* 'vaddr' doubles as the restart cursor returned by pmap_nest_internal()/_ppl(). */
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	/* Completion is signalled by the cursor reaching (vend | PMAP_NEST_GRAND). */
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL could not allocate; give it a page and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor, else we would spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10112 
10113 /*
10114  *	kern_return_t pmap_unnest(grand, vaddr)
10115  *
10116  *	grand  = the pmap that will have the virtual range unnested
10117  *	vaddr  = start of range in pmap to be unnested
10118  *	size   = size of range in pmap to be unnested
10119  *
10120  */
10121 
10122 kern_return_t
10123 pmap_unnest(
10124 	pmap_t grand,
10125 	addr64_t vaddr,
10126 	uint64_t size)
10127 {
10128 	return pmap_unnest_options(grand, vaddr, size, 0);
10129 }
10130 
10131 /**
10132  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10133  * from a top-level pmap ('grand').  The corresponding mappings in the nested
10134  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10135  * still have the region nested.  The mappings in 'grand' will be left empty
10136  * with the assumption that they will be demand-filled by subsequent access faults.
10137  *
10138  * This function operates in 2 main phases:
10139  * 1. Iteration over the nested pmap's mappings for the specified range to mark
10140  *    them non-global.
10141  * 2. Clearing of the twig-level TTEs for the address range in grand.
10142  *
10143  * This function may return early due to pending AST_URGENT preemption; if so
10144  * it will indicate the need to be re-entered.
10145  *
10146  * @param grand pmap from which to unnest mappings
10147  * @param vaddr twig-aligned virtual address for the beginning of the nested range
10148  * @param size twig-aligned size of the nested range
10149  * @param vrestart the page-aligned starting address of the current call.  May contain
10150  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10151  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10152  *        grand is being torn down and step 1) above is not needed.
10153  *
10154  * @return the virtual address at which to restart the operation, possibly including
10155  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
10156  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10157  */
10158 MARK_AS_PMAP_TEXT vm_map_offset_t
10159 pmap_unnest_options_internal(
10160 	pmap_t grand,
10161 	addr64_t vaddr,
10162 	uint64_t size,
10163 	vm_map_offset_t vrestart,
10164 	unsigned int option)
10165 {
10166 	vm_map_offset_t start;
10167 	vm_map_offset_t addr;
10168 	tt_entry_t     *tte_p;
10169 	unsigned int    current_index;
10170 	unsigned int    start_index;
10171 	unsigned int    max_index;
10172 	unsigned int    entry_count = 0;
10173 
10174 	addr64_t vend;
10175 	addr64_t true_end;
10176 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
10177 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
10178 	}
10179 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
10180 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
10181 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
10182 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
10183 	}
10184 
10185 	validate_pmap_mutable(grand);
10186 
10187 	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10188 		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10189 	}
10190 
10191 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
10192 
10193 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10194 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
10195 		    (unsigned long long)vaddr, (unsigned long long)size);
10196 	}
10197 
10198 	if (__improbable(grand->nested_pmap == NULL)) {
10199 		panic("%s: %p has no nested pmap", __func__, grand);
10200 	}
10201 
10202 	true_end = vend;
10203 	if (true_end > grand->nested_pmap->nested_region_true_end) {
10204 		true_end = grand->nested_pmap->nested_region_true_end;
10205 	}
10206 
10207 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10208 		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10209 			return vrestart;
10210 		}
10211 
10212 		start = vrestart;
10213 		if (start < grand->nested_pmap->nested_region_true_start) {
10214 			start = grand->nested_pmap->nested_region_true_start;
10215 		}
10216 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10217 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10218 		bool flush_tlb = false;
10219 
10220 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10221 			pt_entry_t  *bpte, *cpte;
10222 
10223 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10224 
10225 			bpte = pmap_pte(grand->nested_pmap, addr);
10226 
10227 			/*
10228 			 * If we've re-entered this function partway through unnesting a leaf region, the
10229 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10230 			 * the run of PTEs and the adjacent "in-progress" bit will be set.
10231 			 */
10232 			if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10233 			    testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
10234 				/*
10235 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
10236 				 * the nested pmap in this region will now be marked non-global.  Do this
10237 				 * before marking any of the PTEs within the region as non-global to avoid
10238 				 * the possibility of pmap_enter() subsequently inserting a global mapping
10239 				 * in the region, which could lead to a TLB conflict if a non-global entry
10240 				 * is later inserted for the same VA in a pmap which has fully unnested this
10241 				 * region.
10242 				 */
10243 				setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10244 				setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10245 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10246 					pmap_paddr_t    pa;
10247 					unsigned int    pai = 0;
10248 					boolean_t               managed = FALSE;
10249 					pt_entry_t  spte;
10250 
10251 					if (pte_is_valid(*cpte) && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10252 						spte = *((volatile pt_entry_t*)cpte);
10253 						while (!managed) {
10254 							pa = pte_to_pa(spte);
10255 							if (!pa_valid(pa)) {
10256 								break;
10257 							}
10258 							pai = pa_index(pa);
10259 							pvh_lock(pai);
10260 							spte = *((volatile pt_entry_t*)cpte);
10261 							pa = pte_to_pa(spte);
10262 							if (pai == pa_index(pa)) {
10263 								managed = TRUE;
10264 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
10265 							}
10266 							pvh_unlock(pai);
10267 						}
10268 
10269 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10270 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
10271 							flush_tlb = true;
10272 						}
10273 
10274 						if (managed) {
10275 							pvh_assert_locked(pai);
10276 							pvh_unlock(pai);
10277 						}
10278 					}
10279 
10280 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10281 					vrestart = addr;
10282 					++entry_count;
10283 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10284 					    pmap_pending_preemption())) {
10285 						goto unnest_subord_done;
10286 					}
10287 				}
10288 				clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10289 			}
10290 			addr = vlim;
10291 			vrestart = addr;
10292 			++entry_count;
10293 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10294 			    pmap_pending_preemption())) {
10295 				break;
10296 			}
10297 		}
10298 
10299 unnest_subord_done:
10300 		if (flush_tlb) {
10301 			FLUSH_PTE_STRONG();
10302 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10303 		}
10304 
10305 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10306 		if (current_index < max_index) {
10307 			return vrestart;
10308 		}
10309 	}
10310 
10311 	/*
10312 	 * invalidate all pdes for segment at vaddr in pmap grand
10313 	 */
10314 	if (vrestart & PMAP_NEST_GRAND) {
10315 		addr = vrestart & ~PMAP_NEST_GRAND;
10316 		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10317 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10318 		}
10319 	} else {
10320 		addr = vaddr;
10321 		vrestart = vaddr | PMAP_NEST_GRAND;
10322 	}
10323 
10324 	/**
10325 	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10326 	 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10327 	 * upon reentry.
10328 	 */
10329 	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10330 		return vrestart;
10331 	}
10332 
10333 	if (addr < grand->nested_pmap->nested_region_true_start) {
10334 		addr = grand->nested_pmap->nested_region_true_start;
10335 	}
10336 
10337 	start = addr;
10338 
10339 	while (addr < true_end) {
10340 		tte_p = pmap_tte(grand, addr);
10341 		/*
10342 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10343 		 * so it's possible that a region we're trying to unnest may not have been
10344 		 * nested in the first place.
10345 		 */
10346 		if (tte_p != NULL) {
10347 			*tte_p = ARM_TTE_TYPE_FAULT;
10348 		}
10349 		addr += pt_attr_twig_size(pt_attr);
10350 		vrestart = addr | PMAP_NEST_GRAND;
10351 		++entry_count;
10352 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10353 		    pmap_pending_preemption())) {
10354 			break;
10355 		}
10356 	}
10357 	if (addr >= true_end) {
10358 		vrestart = vend | PMAP_NEST_GRAND;
10359 	}
10360 
10361 	FLUSH_PTE_STRONG();
10362 	PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10363 
10364 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10365 
10366 	return vrestart;
10367 }
10368 
10369 kern_return_t
10370 pmap_unnest_options(
10371 	pmap_t grand,
10372 	addr64_t vaddr,
10373 	uint64_t size,
10374 	unsigned int option)
10375 {
10376 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10377 	vm_map_offset_t vend = vaddr + size;
10378 
10379 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10380 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10381 
10382 	pmap_verify_preemptible();
10383 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10384 #if XNU_MONITOR
10385 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10386 #else
10387 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10388 #endif
10389 	}
10390 
10391 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10392 
10393 	return KERN_SUCCESS;
10394 }
10395 
/* No unnest-parameter adjustment is needed on ARM; always report TRUE so callers proceed. */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10404 
10405 #if PMAP_FORK_NEST
10406 /**
10407  * Perform any necessary pre-nesting of the parent's shared region at fork()
10408  * time.
10409  *
10410  * @note This should only be called from vm_map_fork().
10411  *
10412  * @param old_pmap The pmap of the parent task.
10413  * @param new_pmap The pmap of the child task.
10414  * @param nesting_start An output parameter that is updated with the start
10415  *                      address of the range that was pre-nested
10416  * @param nesting_end An output parameter that is updated with the end
10417  *                      address of the range that was pre-nested
10418  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10420  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10421  */
10422 kern_return_t
10423 pmap_fork_nest(
10424 	pmap_t old_pmap,
10425 	pmap_t new_pmap,
10426 	vm_map_offset_t *nesting_start,
10427 	vm_map_offset_t *nesting_end)
10428 {
10429 	if (old_pmap == NULL || new_pmap == NULL) {
10430 		return KERN_INVALID_ARGUMENT;
10431 	}
10432 	if (old_pmap->nested_pmap == NULL) {
10433 		return KERN_SUCCESS;
10434 	}
10435 	pmap_nest(new_pmap,
10436 	    old_pmap->nested_pmap,
10437 	    old_pmap->nested_region_addr,
10438 	    old_pmap->nested_region_size);
10439 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
10440 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
10441 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
10442 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
10443 	    new_pmap->nested_pmap,
10444 	    new_pmap->nested_region_addr,
10445 	    new_pmap->nested_region_size,
10446 	    old_pmap->nested_pmap,
10447 	    old_pmap->nested_region_addr,
10448 	    old_pmap->nested_region_size);
10449 	*nesting_start = old_pmap->nested_region_addr;
10450 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
10451 	return KERN_SUCCESS;
10452 }
10453 #endif /* PMAP_FORK_NEST */
10454 
10455 /*
10456  * disable no-execute capability on
10457  * the specified pmap
10458  */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Clearing nx_enabled disables the no-execute capability for this pmap. */
	pmap->nx_enabled = FALSE;
}
#else
/* On RELEASE kernels NX cannot be disabled; this stub keeps the symbol available. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10473 
10474 /*
10475  * flush a range of hardware TLB entries.
10476  * NOTE: assumes the smallest TLB entry in use will be for
10477  * an ARM small page (4K).
10478  */
10479 
10480 #if __ARM_RANGE_TLBI__
10481 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10482 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10483 #else
10484 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10485 #endif // __ARM_RANGE_TLBI__
10486 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10487     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10488     "of npages to 32 bits below may truncate.");
10489 
/*
 * Issue (without waiting for) TLB invalidations for [va, va + length) in 'pmap'.
 * Escalates from per-entry invalidates to a ranged invalidate (when supported)
 * to a whole-ASID or full TLB flush, based on the number of pages covered.
 * Callers must follow up with sync_tlb_flush() (or equivalent) to synchronize.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Too many pages for individual invalidates: flush by ASID or flush everything. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/* A zero hw_asid or a nested pmap forces a full TLB flush. */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-sized range: use the hardware range-invalidate (TLBI R*) path. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: per-entry invalidation, with the ASID encoded into the TLBI operands. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10542 
/* Issue (without synchronizing) a flush of all TLB entries tagged with this pmap's hardware ASID. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10548 
/* Synchronously flush kernel-pmap TLB entries for [va, va + length). */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	/* Wait for the asynchronous invalidates issued above to complete. */
	sync_tlb_flush();
}
10557 
10558 unsigned int
10559 pmap_cache_attributes(
10560 	ppnum_t pn)
10561 {
10562 	pmap_paddr_t    paddr;
10563 	unsigned int    pai;
10564 	unsigned int    result;
10565 	pp_attr_t       pp_attr_current;
10566 
10567 	paddr = ptoa(pn);
10568 
10569 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10570 
10571 	if (!pa_valid(paddr)) {
10572 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10573 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10574 	}
10575 
10576 	result = VM_WIMG_DEFAULT;
10577 
10578 	pai = pa_index(paddr);
10579 
10580 	pp_attr_current = pp_attr_table[pai];
10581 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10582 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10583 	}
10584 	return result;
10585 }
10586 
/*
 * Perform any cache maintenance required after a page's WIMG (cacheability)
 * attributes change from wimg_bits_prev to wimg_bits_new.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * NOTE(review): the final clause '(new != COPYBACK) || (new != INNERWBACK)'
	 * is a tautology (a value cannot equal both constants), so any transition
	 * away from VM_WIMG_WTHRU triggers a sync.  Possibly '&&' was intended,
	 * mirroring the INNERWBACK clause above -- confirm before changing, since
	 * the current form merely over-synchronizes (safe but conservative).
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Transitions into VM_WIMG_RT force-clean the page out of the dcache. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10603 
/*
 * Switch the cache attributes of a single compressor page from prev_cacheattr
 * to new_cacheattr, then perform any required cache maintenance.
 * 'pn' must be a managed (pa_valid) page; panics otherwise.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock protects this page's attribute and mapping state. */
	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must never be handed to the compressor path. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance (if the transition requires it) happens outside the lock. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10628 
/*
 * Return a kernel virtual address for compressor page 'pn' via the physical
 * aperture.  If the page's cache attributes are not VM_WIMG_DEFAULT, they are
 * switched to default first; pmap_unmap_compressor_page() restores them.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10644 
/*
 * Undo pmap_map_compressor_page(): if the page's recorded cache attributes
 * are non-default, switch the mapping from VM_WIMG_DEFAULT back to them.
 * 'kva' is unused; the physical-aperture mapping itself persists.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10659 
10660 /**
10661  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10662  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10663  *
10664  * @param page_list List of pages to be updated.
10665  * @param cacheattr The new cache attribute.
10666  */
10667 void
10668 pmap_batch_set_cache_attributes(
10669 	const unified_page_list_t *page_list,
10670 	unsigned int cacheattr)
10671 {
10672 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10673 
10674 	if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
10675 		/**
10676 		 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
10677 		 * In an ideal world we would just use these iterator functions within
10678 		 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
10679 		 * that means we'll need to take special care to handle pending preemption and
10680 		 * if necessary return the iterator position out to this function and then re-enter
10681 		 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
10682 		 * secure manner.  Not impossible, but also not trivial, so unless someone asks for
10683 		 * this perf improvement on the PPL I'm going to take the lazy approach here.
10684 		 */
10685 		unified_page_list_iterator_t iter;
10686 
10687 		for (unified_page_list_iterator_init(page_list, &iter);
10688 		    !unified_page_list_iterator_end(&iter);
10689 		    unified_page_list_iterator_next(&iter)) {
10690 			bool is_fictitious = false;
10691 			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10692 			if (__probable(!is_fictitious)) {
10693 #if XNU_MONITOR
10694 				pmap_set_cache_attributes_ppl(pn, cacheattr);
10695 #else /* !XNU_MONITOR */
10696 				pmap_set_cache_attributes_internal(pn, cacheattr);
10697 #endif /* XNU_MONITOR */
10698 			}
10699 		}
10700 		return;
10701 	}
10702 
10703 	if (page_list->upl.upl_size == 0) {
10704 		return;
10705 	}
10706 
10707 	batch_set_cache_attr_state_t states;
10708 	states.page_index = 0;
10709 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10710 	states.tlb_flush_pass_needed = false;
10711 	states.rt_cache_flush_pass_needed = false;
10712 
10713 	/* Verify we are being called from a preemptible context. */
10714 	pmap_verify_preemptible();
10715 
10716 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10717 #if XNU_MONITOR
10718 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
10719 		    states, page_list->upl.upl_size, cacheattr);
10720 #else /* !XNU_MONITOR */
10721 		states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
10722 		    states, page_list->upl.upl_size, cacheattr);
10723 #endif /* XNU_MONITOR */
10724 	}
10725 
10726 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10727 }
10728 
10729 /**
10730  * Flushes TLB entries associated with the page specified by paddr, but do not
10731  * issue barriers yet.
10732  *
10733  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10734  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* A PTEP-type PV head records a single PTE; a PVEP-type head records a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page, issuing an async TLB flush for each. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			/* Each PV entry holds up to PTE_PER_PVE PTE slots; some may be empty. */
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU TLB entries to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PV entry once all of this entry's PTE slots are consumed. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10788 
10789 /**
10790  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10791  *
10792  * @param pai The Physical Address Index of the entry.
10793  * @param cacheattr The new cache attribute.
10794  */
10795 MARK_AS_PMAP_TEXT static void
10796 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10797 {
10798 	pvh_assert_locked(pai);
10799 
10800 	pp_attr_t pp_attr_current, pp_attr_template;
10801 	do {
10802 		pp_attr_current = pp_attr_table[pai];
10803 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10804 
10805 		/**
10806 		 * WIMG bits should only be updated under the PVH lock, but we should do
10807 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10808 		 */
10809 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10810 }
10811 
10812 /**
10813  * Batch updates the cache attributes of a list of pages in three passes.
10814  *
10815  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10816  * In pass two, TLB entries are flushed for each page in the list if necessary.
10817  * In pass three, caches are cleaned for each page in the list if necessary.
10818  *
10819  * When running in PPL, this function may decide to return to the caller in response
10820  * to AST_URGENT.
10821  *
10822  * @param user_page_list List of pages to be updated.
10823  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10824  * @param page_cnt Number of pages in total in user_page_list.
10825  * @param cacheattr The new cache attributes.
10826  *
10827  * @return The new state of the state machine.
10828  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable state machine handed in by the caller. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			/* Snapshot the current attributes under the PVH lock. */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* An all-zero WIMG field means the page uses the default attributes. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Transitioning into RT requires the cache-clean pass (pass 3). */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* TLB invalidates are issued async here; synced below via arm64_sync_tlb(). */
			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Must balance enable_dc_mva_ops() before any early exit. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the state so the caller can resume where we left off. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
11065 
/**
 * Sets the cache attributes of a single managed page, updating the
 * pp_attr_table, all PTEs mapping the page, and performing any required
 * TLB/cache maintenance via pmap_sync_wimg().
 *
 * @param pn Page number of the page to update.
 * @param cacheattr The new cache attributes (VM_WIMG_*).
 * @param external Whether the call originated outside the PPL; used only to
 *        validate page ownership on XNU_MONITOR configurations.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may not touch PPL-owned pages, and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An all-zero WIMG field means the page uses the default attributes. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite PTEs (and flush TLBs) if the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance required by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11128 
/**
 * PPL entry point for pmap_set_cache_attributes(); treats the request as
 * originating outside the PPL (external == TRUE).
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
11136 
/**
 * Sets the cache attributes of a managed page, dispatching through the PPL
 * trampoline on XNU_MONITOR configurations and calling directly otherwise.
 *
 * @param pn Page number of the page to update.
 * @param cacheattr The new cache attributes (VM_WIMG_*).
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
11148 
11149 /**
11150  * Updates the page numbered ppnum to have attribute specified by attributes.
11151  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11152  * The necessity of the TLB flush is returned in case this function is called
11153  * in a batched manner and the TLB flush is intended to be done at a different
11154  * timing.
11155  *
11156  * @param ppnum Page Number of the page to be updated.
11157  * @param attributes The new cache attributes.
11158  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11159  *        immediately.
11160  *
11161  * @return Returns true if a TLB flush is needed for this update regardless of
11162  *         whether a flush has occurred already.
11163  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First update the kernel's physical-aperture mapping of the page. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	/* Replace only the memory-attribute and shareability fields of the PTE. */
	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	/* When not flushing here, the caller is responsible for the TLBI later. */
	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* A page is mapped either by a single PTE or by a list of PV entries. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; nothing to rewrite here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		/* As above: replace only attribute-index and shareability fields. */
		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11305 
11306 /**
11307  * Mark a pmap as being dedicated to use for a commpage mapping.
11308  * The pmap itself will never be activated on a CPU; its mappings will
11309  * only be embedded in userspace pmaps at a fixed virtual address.
11310  *
11311  * @param pmap the pmap to mark as belonging to a commpage.
11312  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be established before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11332 
11333 static void
11334 pmap_update_tt3e(
11335 	pmap_t pmap,
11336 	vm_address_t address,
11337 	tt_entry_t template)
11338 {
11339 	tt_entry_t *ptep, pte;
11340 
11341 	ptep = pmap_tt3e(pmap, address);
11342 	if (ptep == NULL) {
11343 		panic("%s: no ptep?", __FUNCTION__);
11344 	}
11345 
11346 	pte = *ptep;
11347 	pte = tte_to_pa(pte) | template;
11348 	write_pte_strong(ptep, pte);
11349 }
11350 
/* Commpage data template. Note absence of non-global bit: the mapping is
 * shared by all address spaces and must survive ASID-tagged TLB flushes. */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/* Commpage text template. Note absence of non-global bit and of the
 * user no-execute bit (user EL0 must be able to execute from this page). */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11362 
/**
 * Allocates the commpage backing pages and builds the dedicated commpage
 * pmap(s) whose translation tables are later nested into every user pmap.
 *
 * @param kernel_data_addr Out: kernel VA of the commpage data page.
 * @param kernel_text_addr Out: kernel VA of the commpage text page (0 when
 *        CONFIG_ARM_PFZ is not configured).
 * @param kernel_ro_data_addr Out: kernel VA of the kernel read-only data page.
 * @param user_text_addr Out: randomized user VA of the commpage text page
 *        (0 when CONFIG_ARM_PFZ is not configured).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index to slide the text commpage within its L3 table. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11513 
11514 
11515 /*
11516  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11517  * with user controlled TTEs for regions that aren't explicitly reserved by the
11518  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11519  */
11520 #if (ARM_PGSHIFT == 14)
11521 /**
11522  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11523  * commpage completely above the maximum 32-bit userspace VA.
11524  */
11525 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11526 
11527 /**
11528  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11529  * userspace VAs can nest the commpage completely above the maximum 64-bit
11530  * userpace VA, but that technically isn't true on macOS. On those systems, the
11531  * commpage lives within the userspace VA range, but is protected by the VM as
11532  * a reserved region (see vm_reserved_regions[] definition for more info).
11533  */
11534 
11535 #elif (ARM_PGSHIFT == 12)
11536 /**
11537  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11538  * above the maximum userspace VA.
11539  */
11540 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11541 #else
11542 #error Nested shared page mapping is unsupported on this config
11543 #endif
11544 
/**
 * Nests the commpage pmap's translation tables into the given user pmap so
 * the commpage is mapped without dedicating per-process page tables.
 *
 * @param pmap The user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_RESOURCE_SHORTAGE (PPL only) or
 *         KERN_ABORTED if table expansion could not complete.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* The PPL cannot block on allocation; let the caller retry on shortage. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() may block; drop the lock and re-check after expansion. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the commpage pmap's table pointer into this pmap's twig entry. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11647 
/**
 * Remove the commpage twig entry from a pmap, undoing
 * pmap_insert_commpage_internal().  The shared commpage page tables
 * themselves remain intact; only this pmap's pointer to them is cleared,
 * followed by a TLB flush of the commpage VA for this pmap.
 *
 * @param pmap The pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never materialized. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any cached translations for the commpage VA in this ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11704 
11705 void
11706 pmap_insert_commpage(
11707 	pmap_t pmap)
11708 {
11709 	kern_return_t kr = KERN_FAILURE;
11710 #if XNU_MONITOR
11711 	do {
11712 		kr = pmap_insert_commpage_ppl(pmap);
11713 
11714 		if (kr == KERN_RESOURCE_SHORTAGE) {
11715 			pmap_alloc_page_for_ppl(0);
11716 		}
11717 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11718 
11719 	pmap_ledger_check_balance(pmap);
11720 #else
11721 	do {
11722 		kr = pmap_insert_commpage_internal(pmap);
11723 	} while (kr == KERN_ABORTED);
11724 #endif
11725 
11726 	if (kr != KERN_SUCCESS) {
11727 		panic("%s: failed to insert the shared page, kr=%d, "
11728 		    "pmap=%p",
11729 		    __FUNCTION__, kr,
11730 		    pmap);
11731 	}
11732 }
11733 
/* Return whether this pmap describes a 64-bit address space. */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11740 
/*
 * Report whether the pmap has an "exotic" (non-standard) configuration.
 * Always false in this configuration.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11747 
11748 
/* ARMTODO -- an implementation that accounts for
 * holes in the physical map, if any.
 */
/* Return TRUE if physical page number pn is a pmap-managed page. */
boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	return pa_valid(ptoa(pn));
}
11758 
11759 boolean_t
11760 pmap_bootloader_page(
11761 	ppnum_t pn)
11762 {
11763 	pmap_paddr_t paddr = ptoa(pn);
11764 
11765 	if (pa_valid(paddr)) {
11766 		return FALSE;
11767 	}
11768 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11769 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11770 }
11771 
/**
 * Scan a VA range of a pmap and report whether it contains any valid leaf
 * mappings.
 *
 * @param pmap The pmap to scan; NULL is treated as trivially empty.
 * @param va_start Start of the VA range to check.
 * @param va_end End of the VA range to check.
 *
 * @return TRUE if no PTE in [va_start, va_end) is populated (or pmap is
 *         NULL); FALSE as soon as a non-empty PTE is found.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample not_in_kdp once so the lock and unlock decisions stay paired. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Only lock user pmaps, and only when not running under the debugger. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig (last-level page table) at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		/* Clamp this iteration to the end of the current twig entry. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a live mapping: unlock (if locked) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11831 
/**
 * Return TRUE if [va_start, va_end) contains no valid mappings in pmap.
 * Dispatches into the PPL on XNU_MONITOR configurations.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11844 
11845 vm_map_offset_t
11846 pmap_max_offset(
11847 	boolean_t               is64,
11848 	unsigned int    option)
11849 {
11850 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11851 }
11852 
/**
 * Return the maximum user VA boundary for a 64-bit address space.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* selectors; any other value
 *               panics.
 *
 * @return The VA limit implied by the selector.  For the DEFAULT, DEVICE and
 *         JUMBO selectors, a non-zero arm64_pmap_max_offset_default
 *         (boot-arg override) takes precedence.
 */
vm_map_offset_t
pmap_max_64bit_offset(
	__unused unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

#if defined(__arm64__)
	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm64_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = min_max_offset;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		/* Scale the device limit with the amount of physical memory. */
		if (arm64_pmap_max_offset_default) {
			max_offset_ret = arm64_pmap_max_offset_default;
		} else if (max_mem > 0xC0000000) {
			// devices with > 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
		} else if (max_mem > 0x40000000) {
			// devices with > 1GB and <= 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
		} else {
			// devices with <= 1 GB of memory
			max_offset_ret = min_max_offset;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		if (arm64_pmap_max_offset_default) {
			// Allow the boot-arg to override jumbo size
			max_offset_ret = arm64_pmap_max_offset_default;
		} else {
			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
		}
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	} else {
		panic("pmap_max_64bit_offset illegal option 0x%x", option);
	}

	/* DEFAULT may legitimately return 0 (no boot-arg); everything else must be sane. */
	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
		assert(max_offset_ret >= min_max_offset);
	}
#else
	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
#endif

	return max_offset_ret;
}
11905 
/**
 * Return the maximum user VA boundary for a 32-bit address space.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* selectors; any other value
 *               panics.
 *
 * @return The VA limit implied by the selector (VM_MAX_ADDRESS for all
 *         non-DEFAULT selectors unless overridden by
 *         arm_pmap_max_offset_default for DEVICE).
 */
vm_map_offset_t
pmap_max_32bit_offset(
	unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		if (arm_pmap_max_offset_default) {
			max_offset_ret = arm_pmap_max_offset_default;
		} else if (max_mem > 0x20000000) {
			/* NOTE(review): both memory-size tiers currently resolve to
			 * VM_MAX_ADDRESS; the split mirrors the 64-bit variant. */
			max_offset_ret = VM_MAX_ADDRESS;
		} else {
			max_offset_ret = VM_MAX_ADDRESS;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else {
		panic("pmap_max_32bit_offset illegal option 0x%x", option);
	}

	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	return max_offset_ret;
}
11935 
11936 #if CONFIG_DTRACE
11937 /*
11938  * Constrain DTrace copyin/copyout actions
11939  */
11940 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11941 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11942 
11943 kern_return_t
11944 dtrace_copyio_preflight(
11945 	__unused addr64_t va)
11946 {
11947 	if (current_map() == kernel_map) {
11948 		return KERN_FAILURE;
11949 	} else {
11950 		return KERN_SUCCESS;
11951 	}
11952 }
11953 
/* No post-copyio cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11960 #endif /* CONFIG_DTRACE */
11961 
11962 
/* No deferred-flush state is kept on this architecture; nothing to set up. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11967 
11968 
/* Deferred TLB flush of the given context; currently a no-op on ARM. */
void
pmap_flush(
	__unused pmap_flush_context *cpus_to_flush)
{
	/* not implemented yet */
	return;
}
11976 
11977 #if XNU_MONITOR
11978 
11979 /*
11980  * Enforce that the address range described by kva and nbytes is not currently
11981  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11982  * unintentionally writing to PPL-owned memory.
11983  */
11984 void
11985 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11986 {
11987 	vm_offset_t end;
11988 	if (os_add_overflow(kva, nbytes, &end)) {
11989 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11990 	}
11991 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11992 		pmap_paddr_t pa = kvtophys_nofail(ckva);
11993 		unsigned int pai = pa_index(pa);
11994 		pp_attr_t attr;
11995 		if (__improbable(!pa_valid(pa))) {
11996 			panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11997 		}
11998 		pvh_lock(pai);
11999 		if (__improbable(ckva == phystokv(pa))) {
12000 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12001 		}
12002 		do {
12003 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12004 			if (__improbable(attr & PP_ATTR_MONITOR)) {
12005 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12006 			}
12007 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12008 		pvh_unlock(pai);
12009 		if (__improbable(kvtophys_nofail(ckva) != pa)) {
12010 			panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12011 		}
12012 	}
12013 }
12014 
12015 void
12016 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
12017 {
12018 	vm_offset_t end;
12019 	if (os_add_overflow(kva, nbytes, &end)) {
12020 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12021 	}
12022 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12023 		pmap_paddr_t pa = kvtophys_nofail(ckva);
12024 
12025 		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
12026 			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
12027 		}
12028 		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
12029 		ppattr_pa_clear_no_monitor(pa);
12030 	}
12031 }
12032 
12033 /**
12034  * Lock down a page, making all mappings read-only, and preventing further
12035  * mappings or removal of this particular kva's mapping. Effectively, it makes
12036  * the physical page at kva immutable (see the ppl_writable parameter for an
12037  * exception to this).
12038  *
12039  * @param kva Valid address to any mapping of the physical page to lockdown.
12040  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12041  * @param ppl_writable True if the PPL should still be able to write to the page
12042  *                     using the physical aperture mapping. False will make the
12043  *                     page read-only for both the kernel and PPL in the
12044  *                     physical aperture.
12045  */
12046 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Lock down with the default maximum alias permission of VM_PROT_READ. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12052 
12053 /**
12054  * Lock down a page, giving all mappings the specified maximum permissions, and
12055  * preventing further mappings or removal of this particular kva's mapping.
12056  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12057  * parameter for an exception to this).
12058  *
12059  * @param kva Valid address to any mapping of the physical page to lockdown.
12060  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12061  * @param ppl_writable True if the PPL should still be able to write to the page
12062  *                     using the physical aperture mapping. False will make the
12063  *                     page read-only for both the kernel and PPL in the
12064  *                     physical aperture.
12065  * @param prot Maximum permissions to allow in existing alias mappings
12066  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Pages already owned by the PPL cannot be locked down on behalf of the kernel. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* A page may carry at most one lockdown reason and must not be executable. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote every existing alias mapping of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12114 
12115 /**
12116  * Helper for releasing a page from being locked down to the PPL, making it writable to the
12117  * kernel once again.
12118  *
12119  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12120  *       to unlockdown a page that was never locked down, will panic.
12121  *
12122  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
12123  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12124  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12125  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12126  *                     deviation will result in a panic.
12127  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* An unbalanced unlockdown (flag not currently set) is a caller bug. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12148 
12149 /**
12150  * Release a page from being locked down to the PPL, making it writable to the
12151  * kernel once again.
12152  *
12153  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12154  *       to unlockdown a page that was never locked down, will panic.
12155  *
12156  * @param kva Valid address to any mapping of the physical page to unlockdown.
12157  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12158  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12159  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12160  *                     deviation will result in a panic.
12161  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Translate the alias VA to its physical page, then unlock under the PVH lock. */
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
12173 
12174 #else /* XNU_MONITOR */
12175 
/* Pinning only guards against PPL ownership; no-op without XNU_MONITOR. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12180 
/* Counterpart of the pin stub above; no-op without XNU_MONITOR. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12185 
12186 #endif /* !XNU_MONITOR */
12187 
12188 
/*
 * Lock down [kva, kva + size) on behalf of code signing.  On PPL systems the
 * pages are tagged with PVH_FLAG_LOCKDOWN_CS; otherwise a flag of 0 is passed.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12198 
/*
 * Undo pmap_cs_lockdown_pages() for [kva, kva + size), passing the same
 * lockdown flag (PVH_FLAG_LOCKDOWN_CS on PPL systems, 0 otherwise).
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12208 
12209 /**
12210  * Perform basic validation checks on the destination only and
12211  * corresponding offset/sizes prior to writing to a read only allocation.
12212  *
12213  * @note Should be called before writing to an allocation from the read
12214  * only allocator.
12215  *
12216  * @param zid The ID of the zone the allocation belongs to.
12217  * @param va VA of element being modified (destination).
12218  * @param offset Offset being written to, in the element.
12219  * @param new_data_size Size of modification.
12220  *
12221  */
12222 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	/* The destination must belong to one of the read-only zones. */
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/*
	 * NOTE: elem_size - offset is unsigned, so if offset > elem_size the
	 * subtraction wraps and this check can pass; the offset check below
	 * still catches that case and panics.
	 */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
12249 
12250 
12251 /**
12252  * Perform basic validation checks on the source, destination and
12253  * corresponding offset/sizes prior to writing to a read only allocation.
12254  *
12255  * @note Should be called before writing to an allocation from the read
12256  * only allocator.
12257  *
12258  * @param zid The ID of the zone the allocation belongs to.
12259  * @param va VA of element being modified (destination).
12260  * @param offset Offset being written to, in the element.
12261  * @param new_data Pointer to new data (source).
12262  * @param new_data_size Size of modification.
12263  *
12264  */
12265 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	vm_offset_t sum = 0;

	/* Reject a source range that wraps the address space. */
	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
		panic("%s: Integer addition overflow %p + %lu = %lu",
		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
	}

	/* Destination checks (zone membership, offset/size bounds). */
	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
}
12283 
12284 /**
12285  * Ensure that physical page is locked down before writing to it.
12286  *
12287  * @note Should be called before writing to an allocation from the read
12288  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12289  * ensure that it is called after the modification.
12290  *
12291  *
12292  * @param pa Physical address of the element being modified.
12293  * @param va Virtual address of element being modified.
12294  * @param size Size of the modification.
12295  *
12296  */
12297 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	/* One PVH lock covers a single physical page; reject cross-page writes. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12319 
12320 /**
12321  * Unlock physical page after writing to it.
12322  *
12323  * @note Should be called after writing to an allocation from the read
12324  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12325  * ensure that it has been called prior to the modification.
12326  *
12327  * @param pa Physical address of the element that was modified.
12328  * @param va Virtual address of element that was modified.
12329  * @param size Size of the modification.
12330  *
12331  */
12332 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_unlock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va __unused,
	vm_size_t           size __unused)
{
	/* Drop the PVH lock taken by pmap_ro_zone_lock_phy_page(). */
	const unsigned int pai = pa_index(pa);
	pvh_unlock(pai);
}
12342 
12343 /**
12344  * Function to copy kauth_cred from new_data to kv.
12345  * Function defined in "kern_prot.c"
12346  *
12347  * @note Will be removed upon completion of
12348  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12349  *
12350  * @param kv Address to copy new data to.
12351  * @param new_data Pointer to new data.
12352  *
12353  */
12354 
12355 extern void
12356 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12357 
12358 /**
12359  * Zalloc-specific memcpy that writes through the physical aperture
12360  * and ensures the element being modified is from a read-only zone.
12361  *
12362  * @note Designed to work only with the zone allocator's read-only submap.
12363  *
12364  * @param zid The ID of the zone to allocate from.
12365  * @param va VA of element to be modified.
12366  * @param offset Offset from element.
12367  * @param new_data Pointer to new data.
12368  * @param new_data_size	Size of modification.
12369  *
12370  */
12371 
/* Dispatch the read-only-zone memcpy into the PPL when one is present. */
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12386 
/**
 * Copy new_data into a read-only-zone element by writing through the
 * physical aperture, after validating the element and taking its PVH lock.
 *
 * @param zid Zone ID of the element's (read-only) zone.
 * @param va VA of the element being modified (destination).
 * @param offset Offset into the element at which to write.
 * @param new_data Source pointer for the copy.
 * @param new_data_size Bytes to copy; NULL source or size 0 is a no-op.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_memcpy_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	const vm_offset_t     new_data,
	vm_size_t             new_data_size)
{
	/* Note: the VA is translated before the early-out check below. */
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);

	if (!new_data || new_data_size == 0) {
		return;
	}

	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
	/* Write through the physical aperture, bypassing the RO virtual mapping. */
	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
}
12406 
12407 /**
12408  * Zalloc-specific function to atomically mutate fields of an element that
12409  * belongs to a read-only zone, via the physcial aperture.
12410  *
12411  * @note Designed to work only with the zone allocator's read-only submap.
12412  *
12413  * @param zid The ID of the zone the element belongs to.
12414  * @param va VA of element to be modified.
12415  * @param offset Offset in element.
12416  * @param op Atomic operation to perform.
12417  * @param value	Mutation value.
12418  *
12419  */
12420 
/* Dispatch the read-only-zone atomic op into the PPL when one is present. */
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12435 
/**
 * Apply an atomic mutation to a field of a read-only-zone element through
 * the physical aperture, holding the element's PVH lock for the duration.
 *
 * @return The value produced by __zalloc_ro_mut_atomic for the operation.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of the op encodes the operand size in bytes. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12454 
12455 /**
12456  * bzero for allocations from read only zones, that writes through the
12457  * physical aperture.
12458  *
12459  * @note This is called by the zfree path of all allocations from read
12460  * only zones.
12461  *
12462  * @param zid The ID of the zone the allocation belongs to.
12463  * @param va VA of element to be zeroed.
12464  * @param offset Offset in the element.
12465  * @param size	Size of allocation.
12466  *
12467  */
12468 
/* Dispatch the read-only-zone bzero into the PPL when one is present. */
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12482 
/**
 * Zero a range of a read-only-zone element by writing through the physical
 * aperture, after validating the element and taking its PVH lock.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* new_data of 0 — there is no source buffer, only a destination to zero. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12496 
/**
 * Removes write access from the Physical Aperture.
 *
 * @note For non-PPL devices, it simply makes all virtual mappings RO.
 * @note Designed to work only with the zone allocator's read-only submap.
 *
 * @param va VA of the page to remove write access from.
 *
 */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down RO for the kernel; the PPL keeps write access. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* Without a PPL, fall back to demoting every mapping of the page to RO. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12515 
12516 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12517 
/**
 * Count resident and compressed bytes in a VA range of a pmap.
 *
 * @note The range must be page-aligned and must lie entirely within a single
 *       twig (TTE) entry; pmap_query_resident() breaks larger ranges up into
 *       per-twig calls.
 *
 * @param pmap The pmap to query.
 * @param start Page-aligned start of the VA range.
 * @param end Page-aligned end of the VA range.
 * @param compressed_bytes_p If non-NULL, the number of compressed bytes found
 *        is ADDED to this counter ("+="); the caller is responsible for
 *        zeroing it beforehand.
 *
 * @return The number of resident bytes in the range, or PMAP_RESIDENT_INVALID
 *         if the pmap is NULL or has no page table covering the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if (tte_is_valid_table(*tte_p)) {
		/* Walk the leaf-level PTEs covering [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* The caller's counter lives in kernel memory; pin it while the PPL writes it. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12579 
12580 mach_vm_size_t
12581 pmap_query_resident(
12582 	pmap_t                  pmap,
12583 	vm_map_address_t        start,
12584 	vm_map_address_t        end,
12585 	mach_vm_size_t          *compressed_bytes_p)
12586 {
12587 	mach_vm_size_t          total_resident_bytes;
12588 	mach_vm_size_t          compressed_bytes;
12589 	vm_map_address_t        va;
12590 
12591 
12592 	if (pmap == PMAP_NULL) {
12593 		if (compressed_bytes_p) {
12594 			*compressed_bytes_p = 0;
12595 		}
12596 		return 0;
12597 	}
12598 
12599 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12600 
12601 	total_resident_bytes = 0;
12602 	compressed_bytes = 0;
12603 
12604 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12605 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12606 	    VM_KERNEL_ADDRHIDE(end));
12607 
12608 	va = start;
12609 	while (va < end) {
12610 		vm_map_address_t l;
12611 		mach_vm_size_t resident_bytes;
12612 
12613 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12614 
12615 		if (l > end) {
12616 			l = end;
12617 		}
12618 #if XNU_MONITOR
12619 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12620 #else
12621 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12622 #endif
12623 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12624 			break;
12625 		}
12626 
12627 		total_resident_bytes += resident_bytes;
12628 
12629 		va = l;
12630 	}
12631 
12632 	if (compressed_bytes_p) {
12633 		*compressed_bytes_p = compressed_bytes;
12634 	}
12635 
12636 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12637 	    total_resident_bytes);
12638 
12639 	return total_resident_bytes;
12640 }
12641 
12642 #if MACH_ASSERT
12643 static void
12644 pmap_check_ledgers(
12645 	pmap_t pmap)
12646 {
12647 	int     pid;
12648 	char    *procname;
12649 
12650 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12651 		/*
12652 		 * This pmap was not or is no longer fully associated
12653 		 * with a task (e.g. the old pmap after a fork()/exec() or
12654 		 * spawn()).  Its "ledger" still points at a task that is
12655 		 * now using a different (and active) address space, so
12656 		 * we can't check that all the pmap ledgers are balanced here.
12657 		 *
12658 		 * If the "pid" is set, that means that we went through
12659 		 * pmap_set_process() in task_terminate_internal(), so
12660 		 * this task's ledger should not have been re-used and
12661 		 * all the pmap ledgers should be back to 0.
12662 		 */
12663 		return;
12664 	}
12665 
12666 	pid = pmap->pmap_pid;
12667 	procname = pmap->pmap_procname;
12668 
12669 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12670 }
12671 #endif /* MACH_ASSERT */
12672 
/* Intentionally a no-op on this architecture: no pagezero advisory is needed. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12677 
12678 /**
12679  * The minimum shared region nesting size is used by the VM to determine when to
12680  * break up large mappings to nested regions. The smallest size that these
12681  * mappings can be broken into is determined by what page table level those
12682  * regions are being nested in at and the size of the page tables.
12683  *
12684  * For instance, if a nested region is nesting at L2 for a process utilizing
12685  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12686  * block entry).
12687  *
12688  * @param pmap The target pmap to determine the block size based on whether it's
12689  *             using 16KB or 4KB page tables.
12690  */
12691 uint64_t
12692 pmap_shared_region_size_min(__unused pmap_t pmap)
12693 {
12694 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12695 
12696 	/**
12697 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12698 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12699 	 * point to shared L3 page tables in the shared region pmap.
12700 	 */
12701 	return pt_attr_twig_size(pt_attr);
12702 }
12703 
12704 boolean_t
12705 pmap_enforces_execute_only(
12706 	pmap_t pmap)
12707 {
12708 	return pmap != kernel_pmap;
12709 }
12710 
/* Implementation of pmap_set_vm_map_cs_enforced(): record the map's CS-enforcement flag. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12719 
/* Set whether code-signing enforcement is active for the VM map backed by this pmap. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	/* PPL systems: the flag must be updated inside the monitor. */
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12731 
12732 extern int cs_process_enforcement_enable;
12733 bool
12734 pmap_get_vm_map_cs_enforced(
12735 	pmap_t pmap)
12736 {
12737 	if (cs_process_enforcement_enable) {
12738 		return true;
12739 	}
12740 	return pmap->pmap_vm_map_cs_enforced;
12741 }
12742 
/* Intentionally a no-op: JIT entitlement tracking is not used on this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12749 
/* Mark a pmap as JIT-entitled (currently a no-op — see the internal routine). */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12760 
/* JIT entitlement is never recorded on this configuration, so always report false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12767 
/* Intentionally a no-op: TPRO is not supported on this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12774 
/* Enable TPRO for a pmap (currently a no-op — see the internal routine). */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12785 
/* TPRO is never enabled on this configuration, so always report false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12792 
12793 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12794 
/**
 * Report the disposition of the page mapped at a VA in a pmap.
 *
 * The disposition written through disp_p is a bitwise OR of the
 * PMAP_QUERY_PAGE_* flags (present/compressed, plus the alt-accounting,
 * reusable, and internal attributes of the backing page).
 *
 * @param pmap The pmap to query (NULL and the kernel pmap are rejected).
 * @param va The VA whose mapping is examined.
 * @param disp_p Output pointer (pinned while written) for the disposition.
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a NULL/kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE; it is re-checked under the PVH lock below. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/*
		 * Find the PV entry (if any) for this specific mapping so its
		 * per-mapping accounting attributes can be inspected.
		 */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* disp_p points at kernel memory; pin it while the result is written. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12873 
/* Query the disposition (PMAP_QUERY_PAGE_* flags) of the page mapped at va. */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	/* PPL systems: page tables may only be walked inside the monitor. */
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12886 
12887 
12888 
12889 uint32_t
12890 pmap_user_va_bits(pmap_t pmap __unused)
12891 {
12892 #if __ARM_MIXED_PAGE_SIZE__
12893 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12894 	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
12895 #else
12896 	return 64 - T0SZ_BOOT;
12897 #endif
12898 }
12899 
/* Number of valid virtual address bits in the kernel address space (from boot-time T1SZ). */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12905 
/* Size in bytes of the user virtual address space addressable by this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12911 
12912 
12913 
/* Whether the caller is executing inside the PPL; always false when no PPL exists. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12920 
/* I/O-filtered protected writes are not available here; any call is fatal. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12927 
/* No PPL on this configuration: there are no reserved PPL pages to claim. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12934 
/* No PPL on this configuration: freeing a reserved PPL page is a no-op. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12940 
12941 
12942 #if PMAP_CS_PPL_MONITOR
12943 
12944 /* Immutable part of the trust cache runtime */
12945 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12946 
12947 /* Mutable part of the trust cache runtime */
12948 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12949 
12950 /* Lock for the trust cache runtime */
12951 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12952 
12953 MARK_AS_PMAP_TEXT kern_return_t
12954 pmap_check_trust_cache_runtime_for_uuid_internal(
12955 	const uint8_t check_uuid[kUUIDSize])
12956 {
12957 	kern_return_t ret = KERN_DENIED;
12958 
12959 	/* Lock the runtime as shared */
12960 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12961 
12962 	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12963 		&ppl_trust_cache_rt,
12964 		check_uuid,
12965 		NULL);
12966 
12967 	/* Unlock the runtime */
12968 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12969 
12970 	if (tc_ret.error == kTCReturnSuccess) {
12971 		ret = KERN_SUCCESS;
12972 	} else if (tc_ret.error == kTCReturnNotFound) {
12973 		ret = KERN_NOT_FOUND;
12974 	} else {
12975 		ret = KERN_FAILURE;
12976 		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12977 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12978 	}
12979 
12980 	return ret;
12981 }
12982 
/* Kernel-side entry point: proxy the UUID runtime check into the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12989 
/**
 * PPL-side implementation of pmap_load_trust_cache_with_type(): validate and
 * load an image4-wrapped trust cache into the PPL trust cache runtime.
 *
 * The payload and manifest buffers are locked down (transferred to monitor
 * ownership) for the duration of the load.  On success the payload remains
 * owned by the monitor; on failure it is returned to the kernel.  The
 * manifest is always returned to the kernel.
 *
 * @return KERN_SUCCESS on load, KERN_ALREADY_IN_SET for a duplicate,
 *         KERN_RESOURCE_SHORTAGE when the PPL has no spare page (the caller
 *         should donate a page and retry), or KERN_FAILURE/KERN_DENIED.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13109 
13110 kern_return_t
13111 pmap_load_trust_cache_with_type(
13112 	TCType_t type,
13113 	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
13114 	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
13115 	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
13116 {
13117 	kern_return_t ret = KERN_DENIED;
13118 
13119 	ret = pmap_load_trust_cache_with_type_ppl(
13120 		type,
13121 		pmap_img4_payload, pmap_img4_payload_len,
13122 		img4_manifest, img4_manifest_len,
13123 		img4_aux_manifest, img4_aux_manifest_len);
13124 
13125 	while (ret == KERN_RESOURCE_SHORTAGE) {
13126 		/* Allocate a page from the free list */
13127 		pmap_alloc_page_for_ppl(0);
13128 
13129 		/* Attempt the call again */
13130 		ret = pmap_load_trust_cache_with_type_ppl(
13131 			type,
13132 			pmap_img4_payload, pmap_img4_payload_len,
13133 			img4_manifest, img4_manifest_len,
13134 			img4_aux_manifest, img4_aux_manifest_len);
13135 	}
13136 
13137 	return ret;
13138 }
13139 
13140 MARK_AS_PMAP_TEXT kern_return_t
13141 pmap_query_trust_cache_safe(
13142 	TCQueryType_t query_type,
13143 	const uint8_t cdhash[kTCEntryHashSize],
13144 	TrustCacheQueryToken_t *query_token)
13145 {
13146 	kern_return_t ret = KERN_NOT_FOUND;
13147 
13148 	/* Validate the query type preemptively */
13149 	if (query_type >= kTCQueryTypeTotal) {
13150 		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
13151 		return KERN_INVALID_ARGUMENT;
13152 	}
13153 
13154 	/* Lock the runtime as shared */
13155 	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
13156 
13157 	TCReturn_t tc_ret = amfi->TrustCache.query(
13158 		&ppl_trust_cache_rt,
13159 		query_type,
13160 		cdhash,
13161 		query_token);
13162 
13163 	/* Unlock the runtime */
13164 	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
13165 
13166 	if (tc_ret.error == kTCReturnSuccess) {
13167 		ret = KERN_SUCCESS;
13168 	} else if (tc_ret.error == kTCReturnNotFound) {
13169 		ret = KERN_NOT_FOUND;
13170 	} else {
13171 		ret = KERN_FAILURE;
13172 		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
13173 		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
13174 	}
13175 
13176 	return ret;
13177 }
13178 
13179 MARK_AS_PMAP_TEXT kern_return_t
13180 pmap_query_trust_cache_internal(
13181 	TCQueryType_t query_type,
13182 	const uint8_t cdhash[kTCEntryHashSize],
13183 	TrustCacheQueryToken_t *query_token)
13184 {
13185 	kern_return_t ret = KERN_NOT_FOUND;
13186 	TrustCacheQueryToken_t query_token_safe = {0};
13187 	uint8_t cdhash_safe[kTCEntryHashSize] = {0};
13188 
13189 	/* Copy in the CDHash into PPL storage */
13190 	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
13191 
13192 	/* Query through the safe API since we're in the PPL now */
13193 	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
13194 
13195 	if (query_token != NULL) {
13196 		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13197 		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
13198 		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13199 	}
13200 
13201 	return ret;
13202 }
13203 
13204 kern_return_t
13205 pmap_query_trust_cache(
13206 	TCQueryType_t query_type,
13207 	const uint8_t cdhash[kTCEntryHashSize],
13208 	TrustCacheQueryToken_t *query_token)
13209 {
13210 	kern_return_t ret = KERN_NOT_FOUND;
13211 
13212 	ret = pmap_query_trust_cache_ppl(
13213 		query_type,
13214 		cdhash,
13215 		query_token);
13216 
13217 	return ret;
13218 }
13219 
13220 MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
13221 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13222 
/**
 * Update the PPL-owned developer mode state.
 *
 * @note Panics on a false --> true transition after the state has been set
 *       once; developer mode may only be re-enabled across a reboot.
 *
 * @param state The requested developer mode state.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13251 
/* Kernel-side entry point: proxy the developer mode toggle into the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13258 
13259 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13260 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13261 
13262 #pragma mark Image4 - New
13263 
/* Pairs an image4 CS trap selector with the handler resolved for it. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;          /* trap selector being dispatched */
	image4_cs_trap_handler_t handler;   /* AppleImage4 handler for the selector */
} pmap_image4_dispatch_t;
13268 
/* Dispatch a kmod-set-release-type trap after copying its input into PPL storage. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13288 
13289 
13290 
/* Dispatch a nonce-set trap after copying its input into PPL storage. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13311 
/* Dispatch a nonce-roll trap after copying its input into PPL storage. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13328 
/**
 * Dispatch an image-activation trap.  The payload and manifest buffers are
 * locked down (transferred to monitor ownership) around the AppleImage4 call;
 * on success the payload stays locked down, the manifest is always returned.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13384 
13385 MARK_AS_PMAP_TEXT static errno_t
13386 _pmap_image4_monitor_trap_passthrough(
13387 	__unused const pmap_image4_dispatch_t *dispatch,
13388 	__unused const void *input_data,
13389 	__unused size_t input_size)
13390 {
13391 #if DEVELOPMENT || DEBUG || KASAN
13392 	return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13393 #else
13394 	pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13395 	return ENOSYS;
13396 #endif
13397 }
13398 
/**
 * PPL-side implementation of pmap_image4_monitor_trap(): validate the
 * selector and input size, reserve a CoreCrypto spare page, and route the
 * trap to the matching AppleImage4 handler.
 *
 * @param selector The image4 CS trap being requested.
 * @param input_data Kernel-provided input buffer for the trap.
 * @param input_size Size of the input buffer; must match the selector's
 *        expected vector size exactly.
 *
 * @return 0 on success; EINVAL for a bad selector/size, ENOMEM when the PPL
 *         has no spare page (caller should donate one and retry), or the
 *         handler's error.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	/* Route the trap to the selector-specific handler (ToCToU-safe copies inside). */
	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13464 
13465 errno_t
13466 pmap_image4_monitor_trap(
13467 	image4_cs_trap_t selector,
13468 	const void *input_data,
13469 	size_t input_size)
13470 {
13471 	errno_t err = EPERM;
13472 
13473 	err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13474 	while (err == ENOMEM) {
13475 		/* Allocate a page from the free list */
13476 		pmap_alloc_page_for_ppl(0);
13477 
13478 		/* Call the monitor dispatch again */
13479 		err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13480 	}
13481 
13482 	return err;
13483 }
13484 
13485 #endif /* PMAP_CS_PPL_MONITOR */
13486 
13487 #if PMAP_CS_INCLUDE_CODE_SIGNING
13488 
/*
 * Ordering function for the registered-profiles red-black tree: profiles are
 * keyed by their object address.  Returns <0, 0, or >0 in strcmp style.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13501 
13502 /* Red-black tree for managing provisioning profiles */
13503 MARK_AS_PMAP_DATA static
13504 RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;
13505 
13506 RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13507 RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13508 
13509 /* Lock for the profile red-black tree */
13510 MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13511 
13512 void
13513 pmap_initialize_provisioning_profiles(void)
13514 {
13515 	/* Initialize the profiles red-black tree lock */
13516 	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13517 	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13518 
13519 	/* Initialize the red-black tree itself */
13520 	RB_INIT(&pmap_cs_registered_profiles);
13521 
13522 	printf("initialized PPL provisioning profile data\n");
13523 }
13524 
13525 static bool
13526 pmap_is_testflight_profile(
13527 	pmap_cs_profile_t *profile_obj)
13528 {
13529 	const char *entitlement_name = "beta-reports-active";
13530 	const size_t entitlement_length = strlen(entitlement_name);
13531 	CEQueryOperation_t query[2] = {0};
13532 
13533 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13534 	if (profile_obj->entitlements_ctx == NULL) {
13535 		return false;
13536 	}
13537 
13538 	/* Build our CoreEntitlements query */
13539 	query[0].opcode = kCEOpSelectKey;
13540 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13541 	query[0].parameters.stringParameter.length = entitlement_length;
13542 	query[1] = CEMatchBool(true);
13543 
13544 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13545 		profile_obj->entitlements_ctx,
13546 		query, 2);
13547 
13548 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13549 		return true;
13550 	}
13551 
13552 	return false;
13553 }
13554 
13555 static bool
13556 pmap_is_development_profile(
13557 	pmap_cs_profile_t *profile_obj)
13558 {
13559 	/* Check for UPP */
13560 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13561 		*profile_obj->profile_ctx,
13562 		CESelectDictValue("ProvisionsAllDevices"));
13563 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13564 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13565 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13566 			return false;
13567 		}
13568 	}
13569 
13570 	/* Check for TestFlight profile */
13571 	if (pmap_is_testflight_profile(profile_obj) == true) {
13572 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13573 		return false;
13574 	}
13575 
13576 	pmap_cs_log_info("%p: development profile", profile_obj);
13577 	return true;
13578 }
13579 
/*
 * Extract the "Entitlements" dictionary provisioned by a profile, validate it
 * through CoreEntitlements, and stash a query context for it on the profile
 * object.
 *
 * @return
 * KERN_SUCCESS when an entitlements context was setup, KERN_NOT_FOUND when
 * the profile provisions no entitlements (benign for the caller), and
 * KERN_ABORTED when CoreEntitlements rejects the entitlements DER.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Locate the "Entitlements" dictionary within the profile's DER */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* Leave the profile with an explicitly empty entitlements context */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the entitlements DER before acquiring a query context on it */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Acquire a context backed by the validated (unmanaged) DER memory */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13630 
/*
 * PPL-side registration of a provisioning profile.
 *
 * The caller-supplied payload is locked down from the kernel, validated
 * through CoreTrust, wrapped in a CoreEntitlements context, and inserted into
 * the global registered-profiles tree. Validation failures panic, since they
 * indicate a kernel-side caller handed the PPL a malformed payload.
 *
 * @return
 * KERN_SUCCESS on registration; KERN_RESOURCE_SHORTAGE when the caller must
 * donate a page to the PPL and retry (propagated from the page reservation).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature chain through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements DER context around the validated content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13734 
13735 kern_return_t
13736 pmap_register_provisioning_profile(
13737 	const vm_address_t payload_addr,
13738 	const vm_size_t payload_size)
13739 {
13740 	kern_return_t ret = KERN_DENIED;
13741 
13742 	ret = pmap_register_provisioning_profile_ppl(
13743 		payload_addr,
13744 		payload_size);
13745 
13746 	while (ret == KERN_RESOURCE_SHORTAGE) {
13747 		/* Allocate a page from the free list */
13748 		pmap_alloc_page_for_ppl(0);
13749 
13750 		/* Attempt the call again */
13751 		ret = pmap_register_provisioning_profile_ppl(
13752 			payload_addr,
13753 			payload_size);
13754 	}
13755 
13756 	return ret;
13757 }
13758 
/*
 * PPL-side unregistration of a provisioning profile.
 *
 * The profile must be currently registered and must have no outstanding
 * associations with code signatures. The reference count is examined under
 * the exclusive tree lock, which excludes a concurrent association (taken
 * under the shared lock). On success, the original payload pages are
 * unlocked and handed back to the kernel.
 *
 * @return
 * KERN_SUCCESS on unregistration; KERN_FAILURE if the profile is still
 * referenced by one or more code signatures; panics if the profile was never
 * registered.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile memory now belongs to the kernel again -- do not touch it */
		profile_obj = NULL;
	}
	return ret;
}
13806 
13807 kern_return_t
13808 pmap_unregister_provisioning_profile(
13809 	pmap_cs_profile_t *profile_obj)
13810 {
13811 	return pmap_unregister_provisioning_profile_ppl(profile_obj);
13812 }
13813 
/*
 * PPL-side association of a registered provisioning profile with a code
 * signature object.
 *
 * Association is only permitted while the signature is still untrusted and
 * has no profile attached. The profile's reference count is taken while
 * holding the shared tree lock, which excludes a concurrent unregistration
 * (that path requires the exclusive lock before checking the count).
 *
 * @return
 * KERN_SUCCESS on association, KERN_DENIED when the signature state does not
 * permit it; panics on unknown/unvalidated profiles (kernel-side bug).
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13862 
13863 kern_return_t
13864 pmap_associate_provisioning_profile(
13865 	pmap_cs_code_directory_t *cd_entry,
13866 	pmap_cs_profile_t *profile_obj)
13867 {
13868 	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13869 }
13870 
/*
 * PPL-side disassociation of a provisioning profile from a code signature.
 *
 * The reference dropped here is the one taken by the association path. The
 * decrement happens after the code directory lock is released, which is safe
 * because the profile can no longer be reached through this signature once
 * cd_entry->profile_obj has been cleared under the lock.
 *
 * @return
 * KERN_SUCCESS on disassociation; KERN_NOT_FOUND when the signature had no
 * profile associated.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13906 
13907 kern_return_t
13908 pmap_disassociate_provisioning_profile(
13909 	pmap_cs_code_directory_t *cd_entry)
13910 {
13911 	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13912 }
13913 
13914 kern_return_t
13915 pmap_associate_kernel_entitlements_internal(
13916 	pmap_cs_code_directory_t *cd_entry,
13917 	const void *kernel_entitlements)
13918 {
13919 	kern_return_t ret = KERN_DENIED;
13920 
13921 	if (kernel_entitlements == NULL) {
13922 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13923 	}
13924 
13925 	/* Acquire the lock on the code directory */
13926 	pmap_cs_lock_code_directory(cd_entry);
13927 
13928 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13929 		ret = KERN_DENIED;
13930 		goto out;
13931 	} else if (cd_entry->kernel_entitlements != NULL) {
13932 		ret = KERN_DENIED;
13933 		goto out;
13934 	}
13935 	cd_entry->kernel_entitlements = kernel_entitlements;
13936 
13937 	/* Association was a success */
13938 	ret = KERN_SUCCESS;
13939 
13940 out:
13941 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13942 	return ret;
13943 }
13944 
13945 kern_return_t
13946 pmap_associate_kernel_entitlements(
13947 	pmap_cs_code_directory_t *cd_entry,
13948 	const void *kernel_entitlements)
13949 {
13950 	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13951 }
13952 
/*
 * Resolve the kernel entitlements object previously associated with the main
 * code-signing region of a pmap.
 *
 * @param pmap The (non-kernel) pmap whose entitlements are wanted.
 * @param kernel_entitlements Optional out-pointer; written (pinned) when
 *        non-NULL and resolution succeeds.
 *
 * @return
 * KERN_SUCCESS when resolved; KERN_NOT_FOUND when there is no signature or no
 * associated entitlements (or pmap is the kernel pmap); KERN_ABORTED when the
 * pmap lock could not be taken without blocking -- callers retry on this.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14015 
14016 kern_return_t
14017 pmap_resolve_kernel_entitlements(
14018 	pmap_t pmap,
14019 	const void **kernel_entitlements)
14020 {
14021 	kern_return_t ret = KERN_DENIED;
14022 
14023 	do {
14024 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14025 	} while (ret == KERN_ABORTED);
14026 
14027 	return ret;
14028 }
14029 
/*
 * Build a CoreEntitlements acceleration index for a signature's entitlements
 * so later entitlement queries run faster.
 *
 * The index buffer is carved out of spare space at the tail of the
 * locked-down code signature when it fits, otherwise allocated from the PPL
 * blob allocator or as a whole page. Allocator failures are propagated to the
 * caller (the kernel-side wrapper retries on KERN_RESOURCE_SHORTAGE).
 *
 * @return
 * KERN_SUCCESS when accelerated (or when acceleration is unnecessary);
 * KERN_DENIED for non-reconstituted signatures; KERN_ABORTED when the index
 * would exceed a page.
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do when there are no entitlements or they're already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14176 
14177 kern_return_t
14178 pmap_accelerate_entitlements(
14179 	pmap_cs_code_directory_t *cd_entry)
14180 {
14181 	kern_return_t ret = KERN_DENIED;
14182 
14183 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
14184 	while (ret == KERN_RESOURCE_SHORTAGE) {
14185 		/* Allocate a page for the PPL */
14186 		pmap_alloc_page_for_ppl(0);
14187 
14188 		/* Try again */
14189 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
14190 	}
14191 
14192 	return ret;
14193 }
14194 
14195 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14196 
14197 MARK_AS_PMAP_TEXT bool
14198 pmap_lookup_in_loaded_trust_caches_internal(
14199 	const uint8_t cdhash[CS_CDHASH_LEN])
14200 {
14201 	kern_return_t kr = KERN_NOT_FOUND;
14202 
14203 #if PMAP_CS_PPL_MONITOR
14204 	/*
14205 	 * If we have the PPL monitor, then this function can only be called from
14206 	 * within the PPL. Calling it directly would've caused a panic, so we can
14207 	 * assume that we're in the PPL here.
14208 	 */
14209 	uint8_t cdhash_safe[CS_CDHASH_LEN];
14210 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14211 
14212 	kr = pmap_query_trust_cache_safe(
14213 		kTCQueryTypeLoadable,
14214 		cdhash_safe,
14215 		NULL);
14216 #else
14217 	kr = query_trust_cache(
14218 		kTCQueryTypeLoadable,
14219 		cdhash,
14220 		NULL);
14221 #endif
14222 
14223 	if (kr == KERN_SUCCESS) {
14224 		return true;
14225 	}
14226 	return false;
14227 }
14228 
/*
 * Query whether a CDHash is present in any loaded trust cache. Routed through
 * the PPL when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14239 
/*
 * Look up a CDHash in the static trust cache.
 *
 * @return
 * 0 when the CDHash is not found; otherwise a composite value encoding
 * TC_LOOKUP_FOUND, the entry's hash type, and its flags at their respective
 * shifts. Note only the low 8 bits of the 64-bit flags are encoded -- this
 * appears intentional given the TC_LOOKUP_FLAGS_SHIFT field layout, but is
 * worth confirming against the TC_LOOKUP_* definitions.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the matched entry's flags and hash type from the token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14280 
/*
 * Look up a CDHash in the static trust cache. Routed through the PPL when the
 * monitor is enabled.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14290 
14291 #if PMAP_CS_INCLUDE_CODE_SIGNING
14292 
/* Lock protecting the compilation service CDHash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash authorized for the compilation service -- all-zero until explicitly set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14295 
14296 MARK_AS_PMAP_TEXT void
14297 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14298 {
14299 
14300 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14301 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14302 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14303 
14304 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14305 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14306 }
14307 
14308 MARK_AS_PMAP_TEXT bool
14309 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14310 {
14311 	bool match = false;
14312 
14313 	/* Lockdown mode disallows compilation service */
14314 	if (ppl_lockdown_mode_enabled == true) {
14315 		return false;
14316 	}
14317 
14318 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14319 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14320 		match = true;
14321 	}
14322 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14323 
14324 	if (match) {
14325 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14326 	}
14327 
14328 	return match;
14329 }
14330 
/*
 * Set the CDHash recognized for the compilation service. Routed through the
 * PPL when the monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14340 
/*
 * Compare a CDHash against the stored compilation service CDHash. Routed
 * through the PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14350 
14351 /*
14352  * As part of supporting local signing on the device, we need the PMAP layer
14353  * to store the local signing key so that PMAP_CS can validate with it. We
14354  * store it at the PMAP layer such that it is accessible to both AMFI and
14355  * PMAP_CS should they need it.
14356  */
/* True once the local signing public key has been installed -- it can only ever be set once */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Storage for the local signing public key itself */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14359 
14360 MARK_AS_PMAP_TEXT void
14361 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14362 {
14363 	bool key_set = false;
14364 
14365 	/*
14366 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14367 	 * a successful exchange means that the local signing public key has _not_ been
14368 	 * set. In case the key has been set, we panic as we would never expect the
14369 	 * kernel to attempt to set the key more than once.
14370 	 */
14371 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14372 
14373 	if (key_set) {
14374 		panic("attempted to set the local signing public key multiple times");
14375 	}
14376 
14377 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14378 	pmap_cs_log_info("set local signing public key");
14379 }
14380 
/*
 * Install the local signing public key. Routed through the PPL when the
 * monitor is enabled.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14390 
14391 uint8_t*
14392 pmap_get_local_signing_public_key(void)
14393 {
14394 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14395 
14396 	if (key_set) {
14397 		return pmap_local_signing_public_key;
14398 	}
14399 
14400 	return NULL;
14401 }
14402 
14403 /*
14404  * Locally signed applications need to be explicitly authorized by an entitled application
14405  * before we allow them to run.
14406  */
/* Most recently unrestricted CDHash (overwritten wholesale on each unrestrict). */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects all reads and writes of pmap_local_signing_cdhash. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14409 
/*
 * Record the CDHash of the locally-signed binary that has been explicitly
 * authorized to run. Overwrites any previously recorded CDHash — only one
 * unrestricted hash is tracked at a time.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	/* Log only a 5-byte prefix of the hash for identification. */
	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
14422 
/*
 * Authorize a locally-signed binary, identified by its CDHash, to run
 * (see pmap_unrestrict_local_signing_internal).
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	/* PPL configurations must update the hash inside the monitor. */
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14433 
14434 #if PMAP_CS
/*
 * Clear the recorded unrestricted CDHash, re-restricting local signing until
 * the next call to pmap_unrestrict_local_signing().
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14442 
14443 MARK_AS_PMAP_TEXT static bool
14444 pmap_local_signing_restricted(
14445 	const uint8_t cdhash[CS_CDHASH_LEN])
14446 {
14447 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14448 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14449 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14450 
14451 	return ret != 0;
14452 }
14453 
#endif /* PMAP_CS */
14455 #endif
14456 
/*
 * Suspend or resume VM footprint accounting for the current thread.
 * No-op on RELEASE kernels (the tracking fields only exist on
 * DEVELOPMENT/DEBUG builds).
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		/*
		 * NOTE(review): footprint_was_suspended is intentionally not
		 * cleared here — it appears to record that a suspension ever
		 * happened on this pmap; confirm with its consumers.
		 */
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14474 
/*
 * Public entry point for suspending/resuming footprint accounting
 * (see pmap_footprint_suspend_internal).
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14486 
/*
 * No-op beyond validating the pmap argument — presumably used to exercise
 * the (PPL) call path itself; confirm with callers.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14492 
/* Public no-op entry point; dispatches to the PPL on XNU_MONITOR builds. */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14502 
14503 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14504 
/*
 * Header written ahead of each translation table copied by
 * pmap_dump_page_tables_recurse(); the table's raw entries follow it
 * immediately in the output buffer.
 */
struct page_table_dump_header {
	uint64_t pa;          /* Physical address of the copied table. */
	uint64_t num_entries; /* Number of tt_entry_t entries that follow. */
	uint64_t start_va;    /* First VA translated by this table. */
	uint64_t end_va;      /* VA immediately past this table's coverage. */
};
14511 
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer.
 *
 * For each table at a level selected by level_mask, a struct
 * page_table_dump_header is written followed by the raw table entries;
 * valid table (non-block) entries are then followed down to the next level.
 *
 * @param pmap         Pmap whose tables are being dumped.
 * @param ttp          Kernel-virtual pointer to the table at cur_level.
 * @param cur_level    Current lookup level (index into pta_level_info).
 * @param level_mask   Bitmask of levels to include (bit N selects level N).
 * @param start_va     First VA translated by ttp.
 * @param buf_start    Beginning of the output buffer.
 * @param buf_end      End of the output buffer (exclusive).
 * @param bytes_copied In/out: bytes already written into the buffer.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE when the next
 *         header + table copy would not fit.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Resume writing where the previous (sibling/parent) call left off. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table's allocation size may differ from a full page. */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* This level was requested: emit a header plus the raw table. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table to descend into. */
			continue;
		} else {
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				/* A table-typed entry at/below the leaf level is corruption. */
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14586 
/*
 * Dump a pmap's translation tables into a buffer; see
 * pmap_dump_page_tables_recurse() for the output format.
 *
 * @note Must only be called from kernel debugger context; panics otherwise.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14596 
14597 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14598 
/* Page-table dumping is only available on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14605 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14606 
14607 
14608 #ifdef CONFIG_XNUPOST
14609 #ifdef __arm64__
/* Set by pmap_test_fault_handler when an expected test fault is taken. */
static volatile bool pmap_test_took_fault = false;
14611 
14612 static bool
14613 pmap_test_fault_handler(arm_saved_state_t * state)
14614 {
14615 	bool retval                 = false;
14616 	uint64_t esr                = get_saved_state_esr(state);
14617 	esr_exception_class_t class = ESR_EC(esr);
14618 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14619 
14620 	if ((class == ESR_EC_DABORT_EL1) &&
14621 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14622 		pmap_test_took_fault = true;
14623 		/* return to the instruction immediately after the call to NX page */
14624 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14625 		retval = true;
14626 	}
14627 
14628 	return retval;
14629 }
14630 
/*
 * Probe a VA with a load or store, optionally from a test pmap, and report
 * whether the observed fault behavior matched expectations.
 *
 * @param pmap         Pmap to switch to for the probe, or NULL to use the
 *                     current one.
 * @param va           Virtual address to access.
 * @param should_fault Whether the access is expected to fault.
 * @param is_write     True to probe with a store, false with a load.
 *
 * @return true if (took a fault) == should_fault.
 */
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;
	thread_t thread = current_thread();

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap, thread);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler for exactly this address. */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap, thread);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14684 
14685 static bool
14686 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14687 {
14688 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14689 
14690 	if (!retval) {
14691 		T_FAIL("%s: %s, "
14692 		    "pmap=%p, va=%p, should_fault=%u",
14693 		    __func__, should_fault ? "did not fault" : "faulted",
14694 		    pmap, (void*)va, (unsigned)should_fault);
14695 	}
14696 
14697 	return retval;
14698 }
14699 
14700 static bool
14701 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14702 {
14703 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14704 
14705 	if (!retval) {
14706 		T_FAIL("%s: %s, "
14707 		    "pmap=%p, va=%p, should_fault=%u",
14708 		    __func__, should_fault ? "did not fault" : "faulted",
14709 		    pmap, (void*)va, (unsigned)should_fault);
14710 	}
14711 
14712 	return retval;
14713 }
14714 
14715 static bool
14716 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14717 {
14718 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14719 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14720 
14721 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14722 
14723 	if (!retval) {
14724 		T_FAIL("%s: bits=%u, "
14725 		    "pa=%p, should_be_set=%u",
14726 		    __func__, bits,
14727 		    (void*)pa, should_be_set);
14728 	}
14729 
14730 	return retval;
14731 }
14732 
14733 static __attribute__((noinline)) bool
14734 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14735 {
14736 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14737 	return retval;
14738 }
14739 
/*
 * Core pmap unit test: creates a test pmap with the given PMAP_CREATE_*
 * flags and exercises mapping creation, PTE VA/PA lookup, protection
 * changes (pmap_enter_addr / pmap_protect / pmap_page_protect), the
 * ref/mod accounting state machine, arm_fast_fault, disconnect, and
 * teardown. Failures are reported via T_FAIL; fatal setup problems panic.
 *
 * @param flags PMAP_CREATE_* options passed to pmap_create_options().
 *
 * @return 0 on completion.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT.  This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault.  Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages.  Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15004 #endif /* __arm64__ */
15005 
/*
 * XNUPOST entry point for the pmap unit tests.
 *
 * On arm64, runs pmap_test_test_config() for each supported page-size
 * configuration; on other architectures only the log lines run.
 *
 * @return KERN_SUCCESS (individual check failures are reported via T_FAIL).
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
15027 #endif /* CONFIG_XNUPOST */
15028 
15029 /*
15030  * The following function should never make it to RELEASE code, since
15031  * it provides a way to get the PPL to modify text pages.
15032  */
15033 #if DEVELOPMENT || DEBUG
15034 
/* Permanently-undefined instruction encodings (32-bit ARM and 16-bit Thumb). */
#define ARM_UNDEFINED_INSN 0xe7f000f0
#define ARM_UNDEFINED_INSN_THUMB 0xde00
15037 
15038 /**
15039  * Forcibly overwrite executable text with an illegal instruction.
15040  *
15041  * @note Only used for xnu unit testing.
15042  *
15043  * @param pa The physical address to corrupt.
15044  *
15045  * @return KERN_SUCCESS on success.
15046  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* Text pages are PPL-protected here, so the write must happen in the PPL. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
15056 
/*
 * Overwrite the instruction at physical address pa with a permanently
 * undefined encoding, temporarily lifting the physical aperture's
 * write protection for executable pages and invalidating the I-cache.
 * Test-only; see pmap_test_text_corruption().
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		/* Executable page: make the physical-aperture mapping writable. */
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore read-only permissions on the physical-aperture mapping. */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Ensure the I-cache does not keep serving the old instruction. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15099 
15100 #endif /* DEVELOPMENT || DEBUG */
15101