xref: /xnu-8020.101.4/osfmk/arm/pmap.c (revision e7776783b89a353188416a9a346c6cdb4928faad)
1 /*
2  * Copyright (c) 2011-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <mach_assert.h>
30 #include <mach_ldebug.h>
31 
32 #include <mach/shared_region.h>
33 #include <mach/vm_param.h>
34 #include <mach/vm_prot.h>
35 #include <mach/vm_map.h>
36 #include <mach/machine/vm_param.h>
37 #include <mach/machine/vm_types.h>
38 
39 #include <mach/boolean.h>
40 #include <kern/bits.h>
41 #include <kern/thread.h>
42 #include <kern/sched.h>
43 #include <kern/zalloc.h>
44 #include <kern/zalloc_internal.h>
45 #include <kern/kalloc.h>
46 #include <kern/spl.h>
47 #include <kern/startup.h>
48 #include <kern/trustcache.h>
49 
50 #include <os/overflow.h>
51 
52 #include <vm/pmap.h>
53 #include <vm/pmap_cs.h>
54 #include <vm/vm_map.h>
55 #include <vm/vm_kern.h>
56 #include <vm/vm_protos.h>
57 #include <vm/vm_object.h>
58 #include <vm/vm_page.h>
59 #include <vm/vm_pageout.h>
60 #include <vm/cpm.h>
61 
62 #include <libkern/img4/interface.h>
63 #include <libkern/section_keywords.h>
64 #include <sys/errno.h>
65 
66 #include <machine/atomic.h>
67 #include <machine/thread.h>
68 #include <machine/lowglobals.h>
69 
70 #include <arm/caches_internal.h>
71 #include <arm/cpu_data.h>
72 #include <arm/cpu_data_internal.h>
73 #include <arm/cpu_capabilities.h>
74 #include <arm/cpu_number.h>
75 #include <arm/machine_cpu.h>
76 #include <arm/misc_protos.h>
77 #include <arm/pmap/pmap_internal.h>
78 #include <arm/trap.h>
79 
80 #if     (__ARM_VMSA__ > 7)
81 #include <arm64/proc_reg.h>
82 #include <pexpert/arm64/boot.h>
83 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
84 #include <arm64/amcc_rorgn.h>
85 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
86 #endif
87 
88 #include <pexpert/device_tree.h>
89 
90 #include <san/kasan.h>
91 #include <sys/cdefs.h>
92 
93 #if defined(HAS_APPLE_PAC)
94 #include <ptrauth.h>
95 #endif
96 
97 #ifdef CONFIG_XNUPOST
98 #include <tests/xnupost.h>
99 #endif
100 
101 
102 #if HIBERNATION
103 #include <IOKit/IOHibernatePrivate.h>
104 #endif /* HIBERNATION */
105 
106 #ifdef __ARM64_PMAP_SUBPAGE_L1__
107 #if (__ARM_VMSA__ <= 7)
108 #error This is not supported for old-style page tables
109 #endif
110 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
111 #else
112 #if (__ARM_VMSA__ <= 7)
113 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES * 2)
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 #endif
118 
119 extern u_int32_t random(void); /* from <libkern/libkern.h> */
120 
121 static bool alloc_asid(pmap_t pmap);
122 static void free_asid(pmap_t pmap);
123 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
124 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
125 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
126 
127 static const struct page_table_ops native_pt_ops =
128 {
129 	.alloc_id = alloc_asid,
130 	.free_id = free_asid,
131 	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
132 	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
133 	.wimg_to_pte = wimg_to_pte,
134 };
135 
136 #if (__ARM_VMSA__ > 7)
137 const struct page_table_level_info pmap_table_level_info_16k[] =
138 {
139 	[0] = {
140 		.size       = ARM_16K_TT_L0_SIZE,
141 		.offmask    = ARM_16K_TT_L0_OFFMASK,
142 		.shift      = ARM_16K_TT_L0_SHIFT,
143 		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
144 		.valid_mask = ARM_TTE_VALID,
145 		.type_mask  = ARM_TTE_TYPE_MASK,
146 		.type_block = ARM_TTE_TYPE_BLOCK
147 	},
148 	[1] = {
149 		.size       = ARM_16K_TT_L1_SIZE,
150 		.offmask    = ARM_16K_TT_L1_OFFMASK,
151 		.shift      = ARM_16K_TT_L1_SHIFT,
152 		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
153 		.valid_mask = ARM_TTE_VALID,
154 		.type_mask  = ARM_TTE_TYPE_MASK,
155 		.type_block = ARM_TTE_TYPE_BLOCK
156 	},
157 	[2] = {
158 		.size       = ARM_16K_TT_L2_SIZE,
159 		.offmask    = ARM_16K_TT_L2_OFFMASK,
160 		.shift      = ARM_16K_TT_L2_SHIFT,
161 		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
162 		.valid_mask = ARM_TTE_VALID,
163 		.type_mask  = ARM_TTE_TYPE_MASK,
164 		.type_block = ARM_TTE_TYPE_BLOCK
165 	},
166 	[3] = {
167 		.size       = ARM_16K_TT_L3_SIZE,
168 		.offmask    = ARM_16K_TT_L3_OFFMASK,
169 		.shift      = ARM_16K_TT_L3_SHIFT,
170 		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
171 		.valid_mask = ARM_PTE_TYPE_VALID,
172 		.type_mask  = ARM_PTE_TYPE_MASK,
173 		.type_block = ARM_TTE_TYPE_L3BLOCK
174 	}
175 };
176 
177 const struct page_table_level_info pmap_table_level_info_4k[] =
178 {
179 	[0] = {
180 		.size       = ARM_4K_TT_L0_SIZE,
181 		.offmask    = ARM_4K_TT_L0_OFFMASK,
182 		.shift      = ARM_4K_TT_L0_SHIFT,
183 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
184 		.valid_mask = ARM_TTE_VALID,
185 		.type_mask  = ARM_TTE_TYPE_MASK,
186 		.type_block = ARM_TTE_TYPE_BLOCK
187 	},
188 	[1] = {
189 		.size       = ARM_4K_TT_L1_SIZE,
190 		.offmask    = ARM_4K_TT_L1_OFFMASK,
191 		.shift      = ARM_4K_TT_L1_SHIFT,
192 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
193 		.valid_mask = ARM_TTE_VALID,
194 		.type_mask  = ARM_TTE_TYPE_MASK,
195 		.type_block = ARM_TTE_TYPE_BLOCK
196 	},
197 	[2] = {
198 		.size       = ARM_4K_TT_L2_SIZE,
199 		.offmask    = ARM_4K_TT_L2_OFFMASK,
200 		.shift      = ARM_4K_TT_L2_SHIFT,
201 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
202 		.valid_mask = ARM_TTE_VALID,
203 		.type_mask  = ARM_TTE_TYPE_MASK,
204 		.type_block = ARM_TTE_TYPE_BLOCK
205 	},
206 	[3] = {
207 		.size       = ARM_4K_TT_L3_SIZE,
208 		.offmask    = ARM_4K_TT_L3_OFFMASK,
209 		.shift      = ARM_4K_TT_L3_SHIFT,
210 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
211 		.valid_mask = ARM_PTE_TYPE_VALID,
212 		.type_mask  = ARM_PTE_TYPE_MASK,
213 		.type_block = ARM_TTE_TYPE_L3BLOCK
214 	}
215 };
216 
217 const struct page_table_attr pmap_pt_attr_4k = {
218 	.pta_level_info = pmap_table_level_info_4k,
219 	.pta_root_level = (T0SZ_BOOT - 16) / 9,
220 #if __ARM_MIXED_PAGE_SIZE__
221 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
222 #else /* __ARM_MIXED_PAGE_SIZE__ */
223 #if __ARM_16K_PG__
224 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
225 #else /* __ARM_16K_PG__ */
226 	.pta_commpage_level = PMAP_TT_L1_LEVEL,
227 #endif /* __ARM_16K_PG__ */
228 #endif /* __ARM_MIXED_PAGE_SIZE__ */
229 	.pta_max_level  = PMAP_TT_L3_LEVEL,
230 	.pta_ops = &native_pt_ops,
231 	.ap_ro = ARM_PTE_AP(AP_RORO),
232 	.ap_rw = ARM_PTE_AP(AP_RWRW),
233 	.ap_rona = ARM_PTE_AP(AP_RONA),
234 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
235 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
236 	.ap_x = ARM_PTE_PNX,
237 #if __ARM_MIXED_PAGE_SIZE__
238 	.pta_tcr_value  = TCR_EL1_4KB,
239 #endif /* __ARM_MIXED_PAGE_SIZE__ */
240 	.pta_page_size  = 4096,
241 	.pta_page_shift = 12,
242 };
243 
244 const struct page_table_attr pmap_pt_attr_16k = {
245 	.pta_level_info = pmap_table_level_info_16k,
246 	.pta_root_level = PMAP_TT_L1_LEVEL,
247 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
248 	.pta_max_level  = PMAP_TT_L3_LEVEL,
249 	.pta_ops = &native_pt_ops,
250 	.ap_ro = ARM_PTE_AP(AP_RORO),
251 	.ap_rw = ARM_PTE_AP(AP_RWRW),
252 	.ap_rona = ARM_PTE_AP(AP_RONA),
253 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
254 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
255 	.ap_x = ARM_PTE_PNX,
256 #if __ARM_MIXED_PAGE_SIZE__
257 	.pta_tcr_value  = TCR_EL1_16KB,
258 #endif /* __ARM_MIXED_PAGE_SIZE__ */
259 	.pta_page_size  = 16384,
260 	.pta_page_shift = 14,
261 };
262 
263 #if __ARM_16K_PG__
264 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
265 #else /* !__ARM_16K_PG__ */
266 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
267 #endif /* !__ARM_16K_PG__ */
268 
269 
270 #else /* (__ARM_VMSA__ > 7) */
271 /*
272  * We don't support pmap parameterization for VMSA7, so use an opaque
273  * page_table_attr structure.
274  */
275 const struct page_table_attr * const native_pt_attr = NULL;
276 #endif /* (__ARM_VMSA__ > 7) */
277 
278 
279 static inline void
280 pmap_sync_tlb(bool strong __unused)
281 {
282 	sync_tlb_flush();
283 }
284 
285 #if MACH_ASSERT
286 int vm_footprint_suspend_allowed = 1;
287 
288 extern int pmap_ledgers_panic;
289 extern int pmap_ledgers_panic_leeway;
290 
291 #endif /* MACH_ASSERT */
292 
293 #if DEVELOPMENT || DEBUG
294 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
295 	(current_thread()->pmap_footprint_suspended)
296 #else /* DEVELOPMENT || DEBUG */
297 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
298 #endif /* DEVELOPMENT || DEBUG */
299 
300 
301 #ifdef PLATFORM_BridgeOS
302 static struct pmap_legacy_trust_cache *pmap_legacy_trust_caches MARK_AS_PMAP_DATA = NULL;
303 #endif
304 static struct pmap_image4_trust_cache *pmap_image4_trust_caches MARK_AS_PMAP_DATA = NULL;
305 
306 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_loaded_trust_caches_lock, 0);
307 
308 SECURITY_READ_ONLY_LATE(int) srd_fused = 0;
309 
310 /*
311  * Represents a tlb range that will be flushed before exiting
312  * the ppl.
313  * Used by phys_attribute_clear_range to defer flushing pages in
314  * this range until the end of the operation.
315  */
316 typedef struct pmap_tlb_flush_range {
317 	pmap_t ptfr_pmap;
318 	vm_map_address_t ptfr_start;
319 	vm_map_address_t ptfr_end;
320 	bool ptfr_flush_needed;
321 } pmap_tlb_flush_range_t;
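/*
 * Illustrative example only (hypothetical variable names; not a verbatim
 * excerpt of phys_attribute_clear_range): one plausible way a batched PPL
 * operation could use pmap_tlb_flush_range_t to defer TLB maintenance until
 * the end of the operation, flushing the accumulated range exactly once.
 */
#if 0
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false,
	};
	/* ... walk [start, end) clearing attributes, setting ptfr_flush_needed
	 *     whenever a previously-valid PTE is modified ... */
	if (flush_range.ptfr_flush_needed) {
		/* Issue one deferred flush for the whole range before exiting the PPL. */
		pmap_get_pt_ops(flush_range.ptfr_pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			(size_t)(flush_range.ptfr_end - flush_range.ptfr_start),
			flush_range.ptfr_pmap, true);
		sync_tlb_flush();
	}
#endif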
322 
323 #if XNU_MONITOR
324 /*
325  * PPL External References.
326  */
327 extern vm_offset_t   segPPLDATAB;
328 extern unsigned long segSizePPLDATA;
329 extern vm_offset_t   segPPLTEXTB;
330 extern unsigned long segSizePPLTEXT;
331 extern vm_offset_t   segPPLDATACONSTB;
332 extern unsigned long segSizePPLDATACONST;
333 
334 
335 /*
336  * PPL Global Variables
337  */
338 
339 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
340 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
341 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
342 #else
343 const boolean_t pmap_ppl_disable = FALSE;
344 #endif
345 
346 /*
347  * Indicates if the PPL has started applying APRR.
348  * This variable is accessed from various assembly trampolines, so be sure to change
349  * those if you change the size or layout of this variable.
350  */
351 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
352 
353 extern void *pmap_stacks_start;
354 extern void *pmap_stacks_end;
355 
356 #endif /* XNU_MONITOR */
357 
358 
359 /* Virtual memory region for early allocation */
360 #if     (__ARM_VMSA__ == 7)
361 #define VREGION1_HIGH_WINDOW    (0)
362 #else
363 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
364 #endif
365 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
366 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
367 
368 extern uint8_t bootstrap_pagetables[];
369 
370 extern unsigned int not_in_kdp;
371 
372 extern vm_offset_t first_avail;
373 
374 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
375 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
376 extern vm_offset_t     static_memory_end;
377 
378 extern const vm_map_address_t physmap_base;
379 extern const vm_map_address_t physmap_end;
380 
381 extern int maxproc, hard_maxproc;
382 
383 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
384 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
385 
386 #if (__ARM_VMSA__ > 7)
387 /* The number of address bits one TTBR can cover. */
388 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
389 
390 /*
391  * The bounds on our TTBRs.  These are for sanity checking that
392  * an address is accessible by a TTBR before we attempt to map it.
393  */
394 
395 /* The level of the root of a page table. */
396 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
397 
398 /* The number of entries in the root TT of a page table. */
399 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
400 #else
401 const uint64_t arm64_root_pgtable_level = 0;
402 const uint64_t arm64_root_pgtable_num_ttes = 0;
403 #endif
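/*
 * Worked example (illustrative assumptions, not asserted for any particular
 * configuration): with a 4KB translation granule (ARM_PGSHIFT == 12), 8-byte
 * TTEs (TTE_SHIFT == 3), and a hypothetical T0SZ_BOOT of 25, PGTABLE_ADDR_BITS
 * is 64 - 25 = 39, so:
 *   arm64_root_pgtable_level    = 3 - ((39 - 1 - 12) / (12 - 3)) = 3 - 2 = 1
 *   arm64_root_pgtable_num_ttes = 2 << ((39 - 1 - 12) % (12 - 3)) = 2 << 8 = 512
 * i.e. translation would start at L1 with a 512-entry (one 4KB page) root table.
 */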
404 
405 struct pmap                     kernel_pmap_store MARK_AS_PMAP_DATA;
406 SECURITY_READ_ONLY_LATE(pmap_t) kernel_pmap = &kernel_pmap_store;
407 
408 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
409 
410 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
411 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
412 unsigned int    pmap_stamp MARK_AS_PMAP_DATA;
413 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
414 
415 typedef struct tt_free_entry {
416 	struct tt_free_entry    *next;
417 } tt_free_entry_t;
418 
419 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
420 
421 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
422 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
423 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
424 #define FREE_PAGE_SIZE_TT_MAX   4
425 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
426 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
427 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
428 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
429 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
430 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
431 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
432 
433 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
434 
435 boolean_t pmap_gc_allowed MARK_AS_PMAP_DATA = TRUE;
436 boolean_t pmap_gc_forced MARK_AS_PMAP_DATA = FALSE;
437 boolean_t pmap_gc_allowed_by_time_throttle = TRUE;
438 
439 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
440 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
441 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
442 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
443 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
444 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
445 
446 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
447 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
448 
449 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
450 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
451 
452 /* Lock group used for all pmap object locks. */
453 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
454 
455 #if DEVELOPMENT || DEBUG
456 int nx_enabled = 1;                                     /* enable no-execute protection */
457 int allow_data_exec  = 0;                               /* No apps may execute data */
458 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
459 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
460 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
461 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
462 #else /* DEVELOPMENT || DEBUG */
463 const int nx_enabled = 1;                                       /* enable no-execute protection */
464 const int allow_data_exec  = 0;                         /* No apps may execute data */
465 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
466 #endif /* DEVELOPMENT || DEBUG */
467 
468 /**
469  * This variable is set true during hibernation entry to protect pmap data structures
470  * during image copying, and reset false on hibernation exit.
471  */
472 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
473 
474 #if MACH_ASSERT
475 static void pmap_check_ledgers(pmap_t pmap);
476 #else
477 static inline void
478 pmap_check_ledgers(__unused pmap_t pmap)
479 {
480 }
481 #endif /* MACH_ASSERT */
482 
483 /**
484  * This helper function ensures that potentially-long-running batched PPL operations are
485  * called in preemptible context before entering the PPL, so that the PPL call may
486  * periodically exit to allow pending urgent ASTs to be taken.
487  */
488 static inline void
489 pmap_verify_preemptible(void)
490 {
491 	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
492 }
493 
494 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
495 
496 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
497 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
498 
499 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
500 
501 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
502 #if defined(__arm64__)
503 #  ifdef XNU_TARGET_OS_OSX
504 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
505 #  else
506 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
507 #  endif
508 #endif /* __arm64__ */
509 
510 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
511 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
512 #else
513 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
514 #endif
515 
516 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
517 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
518 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
519 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
520 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
521 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
522 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
523 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
524 
525 
526 #if (__ARM_VMSA__ > 7)
527 #if __ARM_MIXED_PAGE_SIZE__
528 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_4k;
529 #endif
530 SECURITY_READ_ONLY_LATE(pmap_t) sharedpage_pmap_default;
531 #endif
532 
533 /* PTE Define Macros */
534 
535 #define pte_is_wired(pte)                                                               \
536 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
537 
538 #define pte_was_writeable(pte) \
539 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
540 
541 #define pte_set_was_writeable(pte, was_writeable) \
542 	do {                                         \
543 	        if ((was_writeable)) {               \
544 	                (pte) |= ARM_PTE_WRITEABLE;  \
545 	        } else {                             \
546 	                (pte) &= ~ARM_PTE_WRITEABLE; \
547 	        }                                    \
548 	} while(0)
549 
550 static inline void
551 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
552 {
553 	if (wired) {
554 		*ptep |= ARM_PTE_WIRED;
555 	} else {
556 		*ptep &= ~ARM_PTE_WIRED;
557 	}
558 	/*
559 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
560 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
561 	 * never reclaimed.
562 	 */
563 	if (pmap == kernel_pmap) {
564 		return;
565 	}
566 	unsigned short *ptd_wiredcnt_ptr;
567 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
568 	if (wired) {
569 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
570 	} else {
571 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
572 		if (__improbable(prev_wired == 0)) {
573 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
574 		}
575 	}
576 }
577 
578 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                       \
579 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
580 	pmap_sync_tlb(strong);                                                                        \
581 }
582 
583 /*
584  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
585  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
586  * will observe the updated PTE.
587  */
588 #define FLUSH_PTE()                                                                     \
589 	__builtin_arm_dmb(DMB_ISH);
590 
591 /*
592  * Synchronize updates to PTEs that were previously valid and thus may be cached in
593  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
594  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
595  * program order will not issue until the DSB completes.  Prior loads may be reordered
596  * after the barrier, but their behavior should not be materially affected by the
597  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
598  * matter for loads until the access is re-driven well after the TLB update is
599  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
600  * we should be in a position to handle access faults.  For "voluntary" PTE access
601  * restriction due to unmapping or protection, the decision to restrict access should
602  * have a data dependency on prior loads in order to avoid a data race.
603  */
604 #define FLUSH_PTE_STRONG()                                                             \
605 	__builtin_arm_dsb(DSB_ISHST);
606 
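/*
 * Illustrative ordering sketch (hypothetical call site; 'ptep', 'va', 'new_pte'
 * and 'pmap' are placeholders): when restricting a previously-valid PTE and
 * then invalidating the TLB, the DSB must order the PTE store before the TLBI.
 */
#if 0
	*ptep = new_pte;                 /* modify the previously-valid PTE           */
	FLUSH_PTE_STRONG();              /* DSB: complete the PTE store before TLBI   */
	flush_mmu_tlb_region_asid_async(va, PAGE_SIZE, pmap, true);
	sync_tlb_flush();                /* synchronize the invalidate                */
#endif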
607 /**
608  * Write enough page table entries to map a single VM page. On systems where the
609  * VM page size does not match the hardware page size, multiple page table
610  * entries will need to be written.
611  *
612  * @note This function does not emit a barrier to ensure these page table writes
613  *       have completed before continuing. This is commonly needed. In the case
614  *       where a DMB or DSB barrier is needed, then use the write_pte() and
615  *       write_pte_strong() functions respectively instead of this one.
616  *
617  * @param ptep Pointer to the first page table entry to update.
618  * @param pte The value to write into each page table entry. In the case that
619  *            multiple PTEs are updated to a non-empty value, then the address
620  *            in this value will automatically be incremented for each PTE
621  *            write.
622  */
623 static void
624 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
625 {
626 	/**
627 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
628 	 * systems, which is why it's checked at runtime instead of compile time.
629 	 * The "unreachable" warning needs to be suppressed because it still is a
630 	 * compile time constant on some systems.
631 	 */
632 	__unreachable_ok_push
633 	if (TEST_PAGE_RATIO_4) {
634 		if (((uintptr_t)ptep) & 0x1f) {
635 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
636 			    __func__, ptep, (void*)pte);
637 		}
638 
639 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
640 			/**
641 			 * If we're writing an empty/compressed PTE value, then don't
642 			 * auto-increment the address for each PTE write.
643 			 */
644 			*ptep = pte;
645 			*(ptep + 1) = pte;
646 			*(ptep + 2) = pte;
647 			*(ptep + 3) = pte;
648 		} else {
649 			*ptep = pte;
650 			*(ptep + 1) = pte | 0x1000;
651 			*(ptep + 2) = pte | 0x2000;
652 			*(ptep + 3) = pte | 0x3000;
653 		}
654 	} else {
655 		*ptep = pte;
656 	}
657 	__unreachable_ok_pop
658 }
659 
660 /**
661  * Writes enough page table entries to map a single VM page and then ensures
662  * those writes complete by executing a Data Memory Barrier.
663  *
664  * @note The DMB issued by this function is not strong enough to protect against
665  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
666  *       instruction is going to immediately be called after this write, it's
667  *       recommended to call write_pte_strong() instead of this function.
668  *
669  * See the function header for write_pte_fast() for more details on the
670  * parameters.
671  */
672 void
673 write_pte(pt_entry_t *ptep, pt_entry_t pte)
674 {
675 	write_pte_fast(ptep, pte);
676 	FLUSH_PTE();
677 }
678 
679 /**
680  * Writes enough page table entries to map a single VM page and then ensures
681  * those writes complete by executing a Data Synchronization Barrier. This
682  * barrier provides stronger guarantees than the DMB executed by write_pte().
683  *
684  * @note This function is useful if you're going to immediately flush the TLB
685  *       after making the PTE write. A DSB is required to protect against the
686  *       TLB invalidate being reordered before the PTE write.
687  *
688  * See the function header for write_pte_fast() for more details on the
689  * parameters.
690  */
691 static void
692 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
693 {
694 	write_pte_fast(ptep, pte);
695 	FLUSH_PTE_STRONG();
696 }
697 
698 /**
699  * Retrieve the pmap structure for the thread running on the current CPU.
700  */
701 pmap_t
702 current_pmap()
703 {
704 	const pmap_t current = vm_map_pmap(current_thread()->map);
705 
706 	assert(current != NULL);
707 
708 #if XNU_MONITOR
709 	/**
710 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
711 	 * decided by kernel-writable memory. This function is used in various parts
712 	 * of the PPL, and besides validating that the pointer returned by this
713 	 * function is indeed a pmap structure, it's also important to ensure that
714 	 * it's actually the current thread's pmap. This is because different pmaps
715 	 * will have access to different entitlements based on the code signature of
716 	 * their loaded process. So if a different user pmap is set in the current
717 	 * thread structure (in an effort to bypass code signing restrictions), even
718 	 * though the structure would validate correctly as it is a real pmap
719 	 * structure, it should fail here.
720 	 *
721 	 * This only needs to occur for user pmaps because the kernel pmap's root
722 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
723 	 * changed so it'd be redundant to check), and its code signing fields are
724 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
725 	 * it shouldn't be possible to set those fields. Due to that, an attacker
726 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
727 	 * this check won't accomplish anything as it doesn't provide any extra code
728 	 * signing entitlements.
729 	 */
730 	if ((current != kernel_pmap) &&
731 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
732 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
733 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
734 	}
735 #endif /* XNU_MONITOR */
736 
737 	return current;
738 }
739 
740 #if DEVELOPMENT || DEBUG
741 
742 /*
743  * Trace levels are controlled by a bitmask in which each
744  * level can be enabled/disabled by the (1<<level) position
745  * in the boot arg
746  * Level 0: PPL extension functionality
747  * Level 1: pmap lifecycle (create/destroy/switch)
748  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
749  * Level 3: internal state management (attributes/fast-fault)
750  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
751  */
752 
753 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
754 
755 #define PMAP_TRACE(level, ...) \
756 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
757 	        KDBG_RELEASE(__VA_ARGS__); \
758 	}
759 #else /* DEVELOPMENT || DEBUG */
760 
761 #define PMAP_TRACE(level, ...)
762 
763 #endif /* DEVELOPMENT || DEBUG */
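/*
 * Example (illustrative only): under the scheme described above, a trace-mask
 * boot-arg of ((1 << 1) | (1 << 2)) == 0x6 enables the pmap lifecycle and
 * mapping lifecycle traces while leaving the PPL, internal-state, and TTE
 * trace levels disabled.
 */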
764 
765 
766 /*
767  * Internal function prototypes (forward declarations).
768  */
769 
770 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
771 
772 static void pmap_set_reference(ppnum_t pn);
773 
774 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
775 
776 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
777 
778 static kern_return_t pmap_expand(
779 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
780 
781 static int pmap_remove_range(
782 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
783 
784 static tt_entry_t *pmap_tt1_allocate(
785 	pmap_t, vm_size_t, unsigned int);
786 
787 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
788 
789 static void pmap_tt1_deallocate(
790 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
791 
792 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
793 
794 static kern_return_t pmap_tt_allocate(
795 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
796 
797 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
798 
799 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
800 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
801 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
802 
803 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
804 
805 #if     (__ARM_VMSA__ > 7)
806 
807 static void pmap_unmap_sharedpage(
808 	pmap_t pmap);
809 
810 static boolean_t
811 pmap_is_64bit(pmap_t);
812 
813 
814 #endif /* (__ARM_VMSA__ > 7) */
815 
816 static void pmap_update_cache_attributes_locked(
817 	ppnum_t, unsigned);
818 
819 static boolean_t arm_clear_fast_fault(
820 	ppnum_t ppnum,
821 	vm_prot_t fault_type,
822 	pt_entry_t *pte_p);
823 
824 static void pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes);
825 
826 static void pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes);
827 
828 static void pmap_trim_self(pmap_t pmap);
829 static void pmap_trim_subord(pmap_t subord);
830 
831 
832 /*
833  * Temporary prototypes, while we wait for pmap_enter to move to taking an
834  * address instead of a page number.
835  */
836 static kern_return_t
837 pmap_enter_addr(
838 	pmap_t pmap,
839 	vm_map_address_t v,
840 	pmap_paddr_t pa,
841 	vm_prot_t prot,
842 	vm_prot_t fault_type,
843 	unsigned int flags,
844 	boolean_t wired);
845 
846 kern_return_t
847 pmap_enter_options_addr(
848 	pmap_t pmap,
849 	vm_map_address_t v,
850 	pmap_paddr_t pa,
851 	vm_prot_t prot,
852 	vm_prot_t fault_type,
853 	unsigned int flags,
854 	boolean_t wired,
855 	unsigned int options,
856 	__unused void   *arg);
857 
858 #ifdef CONFIG_XNUPOST
859 kern_return_t pmap_test(void);
860 #endif /* CONFIG_XNUPOST */
861 
862 PMAP_SUPPORT_PROTOTYPES(
863 	kern_return_t,
864 	arm_fast_fault, (pmap_t pmap,
865 	vm_map_address_t va,
866 	vm_prot_t fault_type,
867 	bool was_af_fault,
868 	bool from_user), ARM_FAST_FAULT_INDEX);
869 
870 PMAP_SUPPORT_PROTOTYPES(
871 	boolean_t,
872 	arm_force_fast_fault, (ppnum_t ppnum,
873 	vm_prot_t allow_mode,
874 	int options), ARM_FORCE_FAST_FAULT_INDEX);
875 
876 MARK_AS_PMAP_TEXT static boolean_t
877 arm_force_fast_fault_with_flush_range(
878 	ppnum_t ppnum,
879 	vm_prot_t allow_mode,
880 	int options,
881 	pmap_tlb_flush_range_t *flush_range);
882 
883 PMAP_SUPPORT_PROTOTYPES(
884 	boolean_t,
885 	pmap_batch_set_cache_attributes, (ppnum_t pn,
886 	unsigned int cacheattr,
887 	unsigned int page_cnt,
888 	unsigned int page_index,
889 	boolean_t doit,
890 	unsigned int *res), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
891 
892 PMAP_SUPPORT_PROTOTYPES(
893 	void,
894 	pmap_change_wiring, (pmap_t pmap,
895 	vm_map_address_t v,
896 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
897 
898 PMAP_SUPPORT_PROTOTYPES(
899 	pmap_t,
900 	pmap_create_options, (ledger_t ledger,
901 	vm_map_size_t size,
902 	unsigned int flags,
903 	kern_return_t * kr), PMAP_CREATE_INDEX);
904 
905 PMAP_SUPPORT_PROTOTYPES(
906 	void,
907 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
908 
909 PMAP_SUPPORT_PROTOTYPES(
910 	kern_return_t,
911 	pmap_enter_options, (pmap_t pmap,
912 	vm_map_address_t v,
913 	pmap_paddr_t pa,
914 	vm_prot_t prot,
915 	vm_prot_t fault_type,
916 	unsigned int flags,
917 	boolean_t wired,
918 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
919 
920 PMAP_SUPPORT_PROTOTYPES(
921 	pmap_paddr_t,
922 	pmap_find_pa, (pmap_t pmap,
923 	addr64_t va), PMAP_FIND_PA_INDEX);
924 
925 #if (__ARM_VMSA__ > 7)
926 PMAP_SUPPORT_PROTOTYPES(
927 	kern_return_t,
928 	pmap_insert_sharedpage, (pmap_t pmap), PMAP_INSERT_SHAREDPAGE_INDEX);
929 #endif
930 
931 
932 PMAP_SUPPORT_PROTOTYPES(
933 	boolean_t,
934 	pmap_is_empty, (pmap_t pmap,
935 	vm_map_offset_t va_start,
936 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
937 
938 
939 PMAP_SUPPORT_PROTOTYPES(
940 	unsigned int,
941 	pmap_map_cpu_windows_copy, (ppnum_t pn,
942 	vm_prot_t prot,
943 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
944 
945 PMAP_SUPPORT_PROTOTYPES(
946 	void,
947 	pmap_ro_zone_memcpy, (zone_id_t zid,
948 	vm_offset_t va,
949 	vm_offset_t offset,
950 	const vm_offset_t new_data,
951 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
952 
953 PMAP_SUPPORT_PROTOTYPES(
954 	uint64_t,
955 	pmap_ro_zone_atomic_op, (zone_id_t zid,
956 	vm_offset_t va,
957 	vm_offset_t offset,
958 	zro_atomic_op_t op,
959 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
960 
961 PMAP_SUPPORT_PROTOTYPES(
962 	void,
963 	pmap_ro_zone_bzero, (zone_id_t zid,
964 	vm_offset_t va,
965 	vm_offset_t offset,
966 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
967 
968 PMAP_SUPPORT_PROTOTYPES(
969 	vm_map_offset_t,
970 	pmap_nest, (pmap_t grand,
971 	pmap_t subord,
972 	addr64_t vstart,
973 	uint64_t size,
974 	vm_map_offset_t vrestart,
975 	kern_return_t * krp), PMAP_NEST_INDEX);
976 
977 PMAP_SUPPORT_PROTOTYPES(
978 	void,
979 	pmap_page_protect_options, (ppnum_t ppnum,
980 	vm_prot_t prot,
981 	unsigned int options,
982 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
983 
984 PMAP_SUPPORT_PROTOTYPES(
985 	vm_map_address_t,
986 	pmap_protect_options, (pmap_t pmap,
987 	vm_map_address_t start,
988 	vm_map_address_t end,
989 	vm_prot_t prot,
990 	unsigned int options,
991 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
992 
993 PMAP_SUPPORT_PROTOTYPES(
994 	kern_return_t,
995 	pmap_query_page_info, (pmap_t pmap,
996 	vm_map_offset_t va,
997 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
998 
999 PMAP_SUPPORT_PROTOTYPES(
1000 	mach_vm_size_t,
1001 	pmap_query_resident, (pmap_t pmap,
1002 	vm_map_address_t start,
1003 	vm_map_address_t end,
1004 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1005 
1006 PMAP_SUPPORT_PROTOTYPES(
1007 	void,
1008 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1009 
1010 PMAP_SUPPORT_PROTOTYPES(
1011 	vm_map_address_t,
1012 	pmap_remove_options, (pmap_t pmap,
1013 	vm_map_address_t start,
1014 	vm_map_address_t end,
1015 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1016 
1017 
1018 PMAP_SUPPORT_PROTOTYPES(
1019 	void,
1020 	pmap_set_cache_attributes, (ppnum_t pn,
1021 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1022 
1023 PMAP_SUPPORT_PROTOTYPES(
1024 	void,
1025 	pmap_update_compressor_page, (ppnum_t pn,
1026 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1027 
1028 PMAP_SUPPORT_PROTOTYPES(
1029 	void,
1030 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1031 
1032 #if MACH_ASSERT || XNU_MONITOR
1033 PMAP_SUPPORT_PROTOTYPES(
1034 	void,
1035 	pmap_set_process, (pmap_t pmap,
1036 	int pid,
1037 	char *procname), PMAP_SET_PROCESS_INDEX);
1038 #endif
1039 
1040 PMAP_SUPPORT_PROTOTYPES(
1041 	void,
1042 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1043 
1044 PMAP_SUPPORT_PROTOTYPES(
1045 	vm_map_offset_t,
1046 	pmap_unnest_options, (pmap_t grand,
1047 	addr64_t vaddr,
1048 	uint64_t size,
1049 	vm_map_offset_t vrestart,
1050 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1051 
1052 PMAP_SUPPORT_PROTOTYPES(
1053 	void,
1054 	phys_attribute_set, (ppnum_t pn,
1055 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1056 
1057 PMAP_SUPPORT_PROTOTYPES(
1058 	void,
1059 	phys_attribute_clear, (ppnum_t pn,
1060 	unsigned int bits,
1061 	int options,
1062 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1063 
1064 #if __ARM_RANGE_TLBI__
1065 PMAP_SUPPORT_PROTOTYPES(
1066 	vm_map_address_t,
1067 	phys_attribute_clear_range, (pmap_t pmap,
1068 	vm_map_address_t start,
1069 	vm_map_address_t end,
1070 	unsigned int bits,
1071 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1072 #endif /* __ARM_RANGE_TLBI__ */
1073 
1074 
1075 PMAP_SUPPORT_PROTOTYPES(
1076 	void,
1077 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1078 
1079 PMAP_SUPPORT_PROTOTYPES(
1080 	void,
1081 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1082 
1083 PMAP_SUPPORT_PROTOTYPES(
1084 	void,
1085 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1086 
1087 PMAP_SUPPORT_PROTOTYPES(
1088 	void,
1089 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1090 
1091 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1092 PMAP_SUPPORT_PROTOTYPES(
1093 	void,
1094 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1095 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1096 
1097 PMAP_SUPPORT_PROTOTYPES(
1098 	void,
1099 	pmap_trim, (pmap_t grand,
1100 	pmap_t subord,
1101 	addr64_t vstart,
1102 	uint64_t size), PMAP_TRIM_INDEX);
1103 
1104 #if HAS_APPLE_PAC
1105 PMAP_SUPPORT_PROTOTYPES(
1106 	void *,
1107 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1108 PMAP_SUPPORT_PROTOTYPES(
1109 	void *,
1110 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1111 #endif /* HAS_APPLE_PAC */
1112 
1113 
1114 
1115 
1116 PMAP_SUPPORT_PROTOTYPES(
1117 	bool,
1118 	pmap_is_trust_cache_loaded, (const uuid_t uuid), PMAP_IS_TRUST_CACHE_LOADED_INDEX);
1119 
1120 PMAP_SUPPORT_PROTOTYPES(
1121 	uint32_t,
1122 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1123 
1124 PMAP_SUPPORT_PROTOTYPES(
1125 	bool,
1126 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1127 
1128 PMAP_SUPPORT_PROTOTYPES(
1129 	void,
1130 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1131 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1132 
1133 PMAP_SUPPORT_PROTOTYPES(
1134 	bool,
1135 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1136 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1137 
1138 PMAP_SUPPORT_PROTOTYPES(
1139 	void,
1140 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE]),
1141 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1142 
1143 PMAP_SUPPORT_PROTOTYPES(
1144 	void,
1145 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1146 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1147 
1148 PMAP_SUPPORT_PROTOTYPES(
1149 	void,
1150 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1151 
1152 void pmap_footprint_suspend(vm_map_t    map,
1153     boolean_t   suspend);
1154 PMAP_SUPPORT_PROTOTYPES(
1155 	void,
1156 	pmap_footprint_suspend, (vm_map_t map,
1157 	boolean_t suspend),
1158 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1159 
1160 
1161 
1162 
1163 #if DEVELOPMENT || DEBUG
1164 PMAP_SUPPORT_PROTOTYPES(
1165 	kern_return_t,
1166 	pmap_test_text_corruption, (pmap_paddr_t),
1167 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1168 #endif /* DEVELOPMENT || DEBUG */
1169 
1170 #if     (__ARM_VMSA__ > 7)
1171 /*
1172  * The low global vector page is mapped at a fixed alias.
1173  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1174  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1175  * to check both addresses anyway for backward compatibility. So for now
1176  * we leave H6 and H7 where they were.
1177  */
1178 #if (ARM_PGSHIFT == 14)
1179 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1180 #else
1181 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1182 #endif
1183 
1184 #else
1185 #define LOWGLOBAL_ALIAS         (0xFFFF1000)
1186 #endif
1187 
1188 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1189 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1190 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1191 
1192 #if XNU_MONITOR
1193 
1194 #if __has_feature(ptrauth_calls)
1195 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1196 #else
1197 #define __ptrauth_ppl_handler
1198 #endif
1199 
1200 /*
1201  * Table of function pointers used for PPL dispatch.
1202  */
1203 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1204 	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1205 	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1206 	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1207 	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1208 	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1209 	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1210 	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1211 	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
1212 	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1213 	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1214 	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1215 	[PMAP_INSERT_SHAREDPAGE_INDEX] = pmap_insert_sharedpage_internal,
1216 	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1217 	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1218 	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1219 	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1220 	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1221 	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1222 	[PMAP_NEST_INDEX] = pmap_nest_internal,
1223 	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1224 	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1225 	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1226 	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1227 	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1228 	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1229 	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1230 	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1231 	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1232 	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1233 	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
1234 	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1235 	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1236 	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1237 	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1238 	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1239 	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1240 	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1241 	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1242 	[PMAP_IS_TRUST_CACHE_LOADED_INDEX] = pmap_is_trust_cache_loaded_internal,
1243 	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1244 	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1245 	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1246 	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1247 	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1248 	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1249 	[PMAP_TRIM_INDEX] = pmap_trim_internal,
1250 	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1251 	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1252 	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1253 #if HAS_APPLE_PAC
1254 	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1255 	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1256 #endif /* HAS_APPLE_PAC */
1257 #if __ARM_RANGE_TLBI__
1258 	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1259 #endif /* __ARM_RANGE_TLBI__ */
1260 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1261 	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1262 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1263 	[PMAP_NOP_INDEX] = pmap_nop_internal,
1264 
1265 #if DEVELOPMENT || DEBUG
1266 	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1267 #endif /* DEVELOPMENT || DEBUG */
1268 };
1269 #endif
1270 
1271 #if XNU_MONITOR
1272 /**
1273  * A convenience function for setting protections on a single physical
1274  * aperture or static region mapping without invalidating the TLB.
1275  *
1276  * @note This function does not perform any TLB invalidations. That must be done
1277  *       separately to be able to safely use the updated mapping.
1278  *
1279  * @note This function understands the difference between the VM page size and
1280  *       the kernel page size and will update multiple PTEs if the sizes differ.
1281  *       In other words, enough PTEs will always get updated to change the
1282  *       permissions on a PAGE_SIZE amount of memory.
1283  *
1284  * @note The PVH lock for the physical page represented by this mapping must
1285  *       already be locked.
1286  *
1287  * @note This function assumes the caller has already verified that the PTE
1288  *       pointer does indeed point to a physical aperture or static region page
1289  *       table. Please validate your inputs before passing it along to this
1290  *       function.
1291  *
1292  * @param ptep Pointer to the physical aperture or static region page table to
1293  *             update with a new XPRR index.
1294  * @param expected_perm The XPRR index that is expected to already exist at the
1295  *                      current mapping. If the current index doesn't match this
1296  *                      then the system will panic.
1297  * @param new_perm The new XPRR index to update the mapping with.
1298  */
1299 MARK_AS_PMAP_TEXT static void
1300 pmap_set_pte_xprr_perm(
1301 	pt_entry_t * const ptep,
1302 	unsigned int expected_perm,
1303 	unsigned int new_perm)
1304 {
1305 	assert(ptep != NULL);
1306 
1307 	pt_entry_t spte = *ptep;
1308 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1309 
1310 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1311 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1312 		    __func__, ptep, new_perm, expected_perm);
1313 	}
1314 
1315 	/**
1316 	 * The PTE involved should be valid, should not have the hint bit set, and
1317 	 * should have the expected XPRR index.
1318 	 */
1319 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1320 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1321 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1322 		    __func__, ptep, spte, new_perm, expected_perm);
1323 	}
1324 
1325 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1326 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1327 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1328 		    __func__, ptep, spte, new_perm, expected_perm);
1329 	}
1330 
1331 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1332 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1333 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1334 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1335 	}
1336 
1337 	pt_entry_t template = spte;
1338 	template &= ~ARM_PTE_XPRR_MASK;
1339 	template |= xprr_perm_to_pte(new_perm);
1340 
1341 	write_pte_strong(ptep, template);
1342 }
1343 
1344 /**
1345  * Update the protections on a single physical aperture mapping and invalidate
1346  * the TLB so the mapping can be used.
1347  *
1348  * @note The PVH lock for the physical page must already be locked.
1349  *
1350  * @param pai The physical address index of the page whose physical aperture
1351  *            mapping will be updated with new permissions.
1352  * @param expected_perm The XPRR index that is expected to already exist at the
1353  *                      current mapping. If the current index doesn't match this
1354  *                      then the system will panic.
1355  * @param new_perm The new XPRR index to update the mapping with.
1356  */
1357 MARK_AS_PMAP_TEXT void
1358 pmap_set_xprr_perm(
1359 	unsigned int pai,
1360 	unsigned int expected_perm,
1361 	unsigned int new_perm)
1362 {
1363 	pvh_assert_locked(pai);
1364 
1365 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1366 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1367 
1368 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1369 
1370 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1371 	sync_tlb_flush();
1372 }
1373 
1374 /**
1375  * Update the protections on a range of physical aperture or static region
1376  * mappings and invalidate the TLB so the mappings can be used.
1377  *
1378  * @note Static region mappings can only be updated before machine_lockdown().
1379  *       Physical aperture mappings can be updated at any time.
1380  *
1381  * @param start The starting virtual address of the static region or physical
1382  *              aperture range whose permissions will be updated.
1383  * @param end The final (inclusive) virtual address of the static region or
1384  *            physical aperture range whose permissions will be updated.
1385  * @param expected_perm The XPRR index that is expected to already exist at the
1386  *                      current mappings. If the current indices don't match
1387  *                      this then the system will panic.
1388  * @param new_perm The new XPRR index to update the mappings with.
1389  */
1390 MARK_AS_PMAP_TEXT static void
1391 pmap_set_range_xprr_perm(
1392 	vm_address_t start,
1393 	vm_address_t end,
1394 	unsigned int expected_perm,
1395 	unsigned int new_perm)
1396 {
1397 #if (__ARM_VMSA__ == 7)
1398 #error This function is not supported on older ARM hardware.
1399 #endif /* (__ARM_VMSA__ == 7) */
1400 
1401 	/**
1402 	 * Validate our arguments; any invalid argument will be grounds for a panic.
1403 	 */
1404 	if (__improbable((start | end) & ARM_PGMASK)) {
1405 		panic_plain("%s: start or end not page aligned, "
1406 		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1407 		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
1408 	}
1409 
1410 	if (__improbable(start > end)) {
1411 		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
1412 		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
1413 	}
1414 
1415 	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
1416 	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);
1417 
1418 	if (__improbable(!(in_physmap || in_static))) {
1419 		panic_plain("%s: address not in static region or physical aperture, "
1420 		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1421 		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
1422 	}
1423 
1424 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1425 		panic_plain("%s: invalid XPRR index, "
1426 		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1427 		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
1428 	}
1429 
1430 	/*
1431 	 * Walk over the PTEs for the given range, and set the protections on those
1432 	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
1433 	 * one twig entry (whichever twig entry currently maps "va").
1434 	 */
1435 	vm_address_t va = start;
1436 	while (va < end) {
1437 		/**
1438 		 * Get the last VA that the twig entry for "va" maps. All of the leaf
1439 		 * PTEs from va to tte_va_end will have their permissions updated.
1440 		 */
1441 		vm_address_t tte_va_end =
1442 		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);
1443 
1444 		if (tte_va_end > end) {
1445 			tte_va_end = end;
1446 		}
1447 
1448 		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);
1449 
1450 		if (ttep == NULL) {
1451 			panic_plain("%s: physical aperture or static region tte is NULL, "
1452 			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1453 			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
1454 		}
1455 
1456 		tt_entry_t tte = *ttep;
1457 
1458 		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1459 			panic_plain("%s: tte=0x%llx is not a table type entry, "
1460 			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
1461 			    tte, (void *)start, (void *)end, new_perm, expected_perm);
1462 		}
1463 
1464 		/* Walk over the given L3 page table page and update the PTEs. */
1465 		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
1466 		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
1467 		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
1468 		pt_entry_t * const end_ptep = begin_ptep + num_ptes;
1469 
1470 		/**
1471 		 * The current PTE pointer is incremented by the page ratio (ratio of
1472 		 * VM page size to kernel hardware page size) because one call to
1473 		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
1474 		 * a PAGE_SIZE worth of hardware pages.
1475 		 */
1476 		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
1477 		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
1478 			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
1479 			pvh_lock(pai);
1480 			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
1481 			pvh_unlock(pai);
1482 		}
1483 
1484 		va = tte_va_end;
1485 	}
1486 
1487 	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
1488 }
1489 
1490 #endif /* XNU_MONITOR */
1491 
1492 static inline void
1493 PMAP_ZINFO_PALLOC(
1494 	pmap_t pmap, int bytes)
1495 {
1496 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1497 }
1498 
1499 static inline void
1500 PMAP_ZINFO_PFREE(
1501 	pmap_t pmap,
1502 	int bytes)
1503 {
1504 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1505 }
1506 
1507 void
1508 pmap_tt_ledger_credit(
1509 	pmap_t          pmap,
1510 	vm_size_t       size)
1511 {
1512 	if (pmap != kernel_pmap) {
1513 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1514 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1515 	}
1516 }
1517 
1518 void
1519 pmap_tt_ledger_debit(
1520 	pmap_t          pmap,
1521 	vm_size_t       size)
1522 {
1523 	if (pmap != kernel_pmap) {
1524 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1525 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1526 	}
1527 }
1528 
1529 static inline void
1530 pmap_update_plru(uint16_t asid_index)
1531 {
1532 	if (__probable(pmap_asid_plru)) {
1533 		unsigned plru_index = asid_index >> 6;
1534 		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1535 			asid_plru_generation[plru_index] = ++asid_plru_gencount;
1536 			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1537 		}
1538 	}
1539 }
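/*
 * Illustrative note on the PLRU bitmap indexing above (assumption, not from the
 * original source): asid_index >> 6 selects the 64-bit word of the bitmap and
 * (asid_index & 63) selects the bit within that word.  For example, asid_index
 * 130 clears bit 2 of asid_plru_bitmap[2]; once a word drains to zero it is
 * refilled and its generation count bumped, which is what ages out that chunk
 * of hardware ASIDs for the allocator below.
 */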
1540 
1541 static bool
1542 alloc_asid(pmap_t pmap)
1543 {
1544 	int vasid = -1;
1545 	uint16_t hw_asid;
1546 
1547 	pmap_simple_lock(&asid_lock);
1548 
1549 	if (__probable(pmap_asid_plru)) {
1550 		unsigned plru_index = 0;
1551 		uint64_t lowest_gen = asid_plru_generation[0];
1552 		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1553 		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1554 			if (asid_plru_generation[i] < lowest_gen) {
1555 				plru_index = i;
1556 				lowest_gen = asid_plru_generation[i];
1557 				lowest_gen_bitmap = asid_plru_bitmap[i];
1558 			}
1559 		}
1560 
1561 		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
1562 			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1563 			if (temp_plru) {
1564 				vasid = (plru_index << 6) + lsb_first(temp_plru);
1565 #if DEVELOPMENT || DEBUG
1566 				++pmap_asid_hits;
1567 #endif
1568 				break;
1569 			}
1570 		}
1571 	}
1572 	if (__improbable(vasid < 0)) {
1573 		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
1574 		// slightly better with the collision detection scheme used by pmap_switch_internal().
1575 		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1576 #if DEVELOPMENT || DEBUG
1577 		++pmap_asid_misses;
1578 #endif
1579 	}
1580 	if (__improbable(vasid < 0)) {
1581 		pmap_simple_unlock(&asid_lock);
1582 		return false;
1583 	}
1584 	assert((uint32_t)vasid < pmap_max_asids);
1585 	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1586 	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1587 	pmap_simple_unlock(&asid_lock);
1588 	hw_asid = (uint16_t)(vasid % asid_chunk_size);
1589 	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
1590 	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
1591 		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
1592 		 * reassign to a reserved VASID. */
1593 		assert(pmap->sw_asid < UINT8_MAX);
1594 		pmap->sw_asid = UINT8_MAX;
1595 		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
1596 		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
1597 		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
1598 		assert(hw_asid < MAX_HW_ASIDS);
1599 	}
1600 	pmap_update_plru(hw_asid);
1601 	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
1602 #if __ARM_KERNEL_PROTECT__
1603 	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
1604 #endif
1605 	pmap->hw_asid = hw_asid;
1606 	return true;
1607 }
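/*
 * Illustrative sketch of the vasid split above (values are hypothetical, not
 * from the original source): with asid_chunk_size == 256,
 *
 *     vasid = 517  ->  sw_asid = 517 / 256 = 2,  hw_asid = 517 % 256 = 5
 *
 * free_asid() below reconstructs vasid from the same pair, so the two routines
 * must stay in agreement, including the UINT8_MAX reserved-VASID special case.
 */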
1608 
1609 static void
1610 free_asid(pmap_t pmap)
1611 {
1612 	unsigned int vasid;
1613 	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
1614 	if (__improbable(hw_asid == 0)) {
1615 		return;
1616 	}
1617 
1618 #if __ARM_KERNEL_PROTECT__
1619 	hw_asid >>= 1;
1620 #endif
1621 	hw_asid -= 1;
1622 
1623 	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
1624 		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
1625 	} else {
1626 		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
1627 	}
1628 
1629 	if (__probable(pmap_asid_plru)) {
1630 		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1631 	}
1632 	pmap_simple_lock(&asid_lock);
1633 	assert(!bitmap_test(&asid_bitmap[0], vasid));
1634 	bitmap_set(&asid_bitmap[0], vasid);
1635 	pmap_simple_unlock(&asid_lock);
1636 }
1637 
1638 
1639 boolean_t
1640 pmap_valid_address(
1641 	pmap_paddr_t addr)
1642 {
1643 	return pa_valid(addr);
1644 }
1645 
1646 
1647 
1648 
1649 
1650 
1651 /*
1652  *      Map memory at initialization.  The physical addresses being
1653  *      mapped are not managed and are never unmapped.
1654  *
1655  *      For now, VM is already on, we only need to map the
1656  *      specified memory.
1657  */
1658 vm_map_address_t
1659 pmap_map(
1660 	vm_map_address_t virt,
1661 	vm_offset_t start,
1662 	vm_offset_t end,
1663 	vm_prot_t prot,
1664 	unsigned int flags)
1665 {
1666 	kern_return_t   kr;
1667 	vm_size_t       ps;
1668 
1669 	ps = PAGE_SIZE;
1670 	while (start < end) {
1671 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1672 		    prot, VM_PROT_NONE, flags, FALSE);
1673 
1674 		if (kr != KERN_SUCCESS) {
1675 			panic("%s: failed pmap_enter, "
1676 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1677 			    __FUNCTION__,
1678 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1679 		}
1680 
1681 		virt += ps;
1682 		start += ps;
1683 	}
1684 	return virt;
1685 }
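/*
 * Illustrative usage sketch (hypothetical addresses and flags, not from the
 * original source): during early bring-up a caller might wire down a physical
 * range with default attributes, e.g.
 *
 *     vm_map_address_t next_va = pmap_map(va, phys_start, phys_end,
 *         VM_PROT_READ | VM_PROT_WRITE, 0);
 *
 * The return value is the first virtual address past the newly entered
 * mappings, which the caller can feed into a subsequent pmap_map() call.
 */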
1686 
1687 vm_map_address_t
1688 pmap_map_bd_with_options(
1689 	vm_map_address_t virt,
1690 	vm_offset_t start,
1691 	vm_offset_t end,
1692 	vm_prot_t prot,
1693 	int32_t options)
1694 {
1695 	pt_entry_t      tmplate;
1696 	pt_entry_t     *ptep;
1697 	vm_map_address_t vaddr;
1698 	vm_offset_t     paddr;
1699 	pt_entry_t      mem_attr;
1700 
1701 	switch (options & PMAP_MAP_BD_MASK) {
1702 	case PMAP_MAP_BD_WCOMB:
1703 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1704 #if     (__ARM_VMSA__ > 7)
1705 		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1706 #else
1707 		mem_attr |= ARM_PTE_SH;
1708 #endif
1709 		break;
1710 	case PMAP_MAP_BD_POSTED:
1711 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1712 		break;
1713 	case PMAP_MAP_BD_POSTED_REORDERED:
1714 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1715 		break;
1716 	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1717 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1718 		break;
1719 	default:
1720 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1721 		break;
1722 	}
1723 
1724 	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1725 	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1726 #if __ARM_KERNEL_PROTECT__
1727 	tmplate |= ARM_PTE_NG;
1728 #endif /* __ARM_KERNEL_PROTECT__ */
1729 
1730 	vaddr = virt;
1731 	paddr = start;
1732 	while (paddr < end) {
1733 		ptep = pmap_pte(kernel_pmap, vaddr);
1734 		if (ptep == PT_ENTRY_NULL) {
1735 			panic("%s: no PTE for vaddr=%p, "
1736 			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
1737 			    __FUNCTION__, (void*)vaddr,
1738 			    (void*)virt, (void*)start, (void*)end, prot, options);
1739 		}
1740 
1741 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1742 		write_pte_strong(ptep, tmplate);
1743 
1744 		pte_increment_pa(tmplate);
1745 		vaddr += PAGE_SIZE;
1746 		paddr += PAGE_SIZE;
1747 	}
1748 
1749 	if (end >= start) {
1750 		flush_mmu_tlb_region(virt, (unsigned)(end - start));
1751 	}
1752 
1753 	return vaddr;
1754 }
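/*
 * Illustrative usage sketch (hypothetical addresses, not from the original
 * source): map one page of device registers write-combined before kernel_map
 * is usable,
 *
 *     pmap_map_bd_with_options(dev_va, dev_pa, dev_pa + PAGE_SIZE,
 *         VM_PROT_READ | VM_PROT_WRITE, PMAP_MAP_BD_WCOMB);
 *
 * Any option outside PMAP_MAP_BD_MASK falls through to the default
 * CACHE_ATTRINDX_DISABLE (uncached, unbuffered) attribute in the switch above.
 */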
1755 
1756 /*
1757  *      Back-door routine for mapping kernel VM at initialization.
1758  *      Useful for mapping memory outside the range
1759  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1760  *      Otherwise like pmap_map.
1761  */
1762 vm_map_address_t
1763 pmap_map_bd(
1764 	vm_map_address_t virt,
1765 	vm_offset_t start,
1766 	vm_offset_t end,
1767 	vm_prot_t prot)
1768 {
1769 	pt_entry_t      tmplate;
1770 	pt_entry_t              *ptep;
1771 	vm_map_address_t vaddr;
1772 	vm_offset_t             paddr;
1773 
1774 	/* not cacheable and not buffered */
1775 	tmplate = pa_to_pte(start)
1776 	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1777 	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1778 	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1779 #if __ARM_KERNEL_PROTECT__
1780 	tmplate |= ARM_PTE_NG;
1781 #endif /* __ARM_KERNEL_PROTECT__ */
1782 
1783 	vaddr = virt;
1784 	paddr = start;
1785 	while (paddr < end) {
1786 		ptep = pmap_pte(kernel_pmap, vaddr);
1787 		if (ptep == PT_ENTRY_NULL) {
1788 			panic("%s: no PTE for vaddr=%p", __FUNCTION__, (void*)vaddr);
1789 		}
1790 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1791 		write_pte_strong(ptep, tmplate);
1792 
1793 		pte_increment_pa(tmplate);
1794 		vaddr += PAGE_SIZE;
1795 		paddr += PAGE_SIZE;
1796 	}
1797 
1798 	if (end >= start) {
1799 		flush_mmu_tlb_region(virt, (unsigned)(end - start));
1800 	}
1801 
1802 	return vaddr;
1803 }
1804 
1805 /*
1806  *      Back-door routine for mapping kernel VM at initialization.
1807  *      Useful for mapping memory specific physical addresses in early
1808  *      boot (i.e., before kernel_map is initialized).
1809  *
1810  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1811  */
1812 
1813 vm_map_address_t
1814 pmap_map_high_window_bd(
1815 	vm_offset_t pa_start,
1816 	vm_size_t len,
1817 	vm_prot_t prot)
1818 {
1819 	pt_entry_t              *ptep, pte;
1820 #if (__ARM_VMSA__ == 7)
1821 	vm_map_address_t        va_start = VM_HIGH_KERNEL_WINDOW;
1822 	vm_map_address_t        va_max = VM_MAX_KERNEL_ADDRESS;
1823 #else
1824 	vm_map_address_t        va_start = VREGION1_START;
1825 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1826 #endif
1827 	vm_map_address_t        va_end;
1828 	vm_map_address_t        va;
1829 	vm_size_t               offset;
1830 
1831 	offset = pa_start & PAGE_MASK;
1832 	pa_start -= offset;
1833 	len += offset;
1834 
1835 	if (len > (va_max - va_start)) {
1836 		panic("%s: area too large, "
1837 		    "pa_start=%p, len=%p, prot=0x%x",
1838 		    __FUNCTION__,
1839 		    (void*)pa_start, (void*)len, prot);
1840 	}
1841 
1842 scan:
1843 	for (; va_start < va_max; va_start += PAGE_SIZE) {
1844 		ptep = pmap_pte(kernel_pmap, va_start);
1845 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1846 		if (*ptep == ARM_PTE_TYPE_FAULT) {
1847 			break;
1848 		}
1849 	}
1850 	if (va_start > va_max) {
1851 		panic("%s: insufficient pages, "
1852 		    "pa_start=%p, len=%p, prot=0x%x",
1853 		    __FUNCTION__,
1854 		    (void*)pa_start, (void*)len, prot);
1855 	}
1856 
1857 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1858 		ptep = pmap_pte(kernel_pmap, va_end);
1859 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1860 		if (*ptep != ARM_PTE_TYPE_FAULT) {
1861 			va_start = va_end + PAGE_SIZE;
1862 			goto scan;
1863 		}
1864 	}
1865 
1866 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1867 		ptep = pmap_pte(kernel_pmap, va);
1868 		pte = pa_to_pte(pa_start)
1869 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1870 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1871 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1872 #if     (__ARM_VMSA__ > 7)
1873 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1874 #else
1875 		pte |= ARM_PTE_SH;
1876 #endif
1877 #if __ARM_KERNEL_PROTECT__
1878 		pte |= ARM_PTE_NG;
1879 #endif /* __ARM_KERNEL_PROTECT__ */
1880 		write_pte_strong(ptep, pte);
1881 	}
1882 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1883 #if KASAN
1884 	kasan_notify_address(va_start, len);
1885 #endif
1886 	return va_start;
1887 }
1888 
1889 static uint32_t
1890 pmap_compute_max_asids(void)
1891 {
1892 	DTEntry entry;
1893 	void const *prop = NULL;
1894 	uint32_t max_asids;
1895 	int err;
1896 	unsigned int prop_size;
1897 
1898 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1899 	assert(err == kSuccess);
1900 
1901 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
1902 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
1903 		 * we can choose a more flexible default value here. */
1904 		return MAX_ASIDS;
1905 	}
1906 
1907 	if (prop_size != sizeof(max_asids)) {
1908 		panic("pmap-max-asids property is not a 32-bit integer");
1909 	}
1910 
1911 	max_asids = *((uint32_t const *)prop);
1912 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
1913 	max_asids = (max_asids + 63) & ~63UL;
1914 
1915 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
1916 		/* currently capped by size of pmap->sw_asid */
1917 		panic("pmap-max-asids too large");
1918 	}
1919 	if (max_asids == 0) {
1920 		panic("pmap-max-asids cannot be zero");
1921 	}
1922 	return max_asids;
1923 }
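/*
 * Illustrative note (values hypothetical, not from the original source): a
 * device-tree value of pmap-max-asids = 1000 is rounded up to
 * (1000 + 63) & ~63 = 1024, so the Pseudo-LRU allocator can treat the virtual
 * ASID space as whole 64-bit bitmap words.
 */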
1924 
1925 #if __arm64__
1926 /*
1927  * pmap_get_arm64_prot
1928  *
1929  * return effective armv8 VMSA block protections including
1930  * table AP/PXN/XN overrides of a pmap entry
1931  *
1932  */
1933 
1934 uint64_t
1935 pmap_get_arm64_prot(
1936 	pmap_t pmap,
1937 	vm_offset_t addr)
1938 {
1939 	tt_entry_t tte = 0;
1940 	unsigned int level = 0;
1941 	uint64_t tte_type = 0;
1942 	uint64_t effective_prot_bits = 0;
1943 	uint64_t aggregate_tte = 0;
1944 	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1945 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1946 
1947 	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1948 		tte = *pmap_ttne(pmap, level, addr);
1949 
1950 		if (!(tte & ARM_TTE_VALID)) {
1951 			return 0;
1952 		}
1953 
1954 		tte_type = tte & ARM_TTE_TYPE_MASK;
1955 
1956 		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
1957 		    (level == pt_attr->pta_max_level)) {
1958 			/* Block or page mapping; both have the same protection bit layout. */
1959 			break;
1960 		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
1961 			/* All of the table bits we care about are overrides, so just OR them together. */
1962 			aggregate_tte |= tte;
1963 		}
1964 	}
1965 
1966 	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1967 	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1968 	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1969 
1970 	/* Start with the PTE bits. */
1971 	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1972 
1973 	/* Table AP bits mask out block/page AP bits */
1974 	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1975 
1976 	/* XN/PXN bits can be OR'd in. */
1977 	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1978 	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1979 
1980 	return effective_prot_bits;
1981 }
1982 #endif /* __arm64__ */
1983 
1984 static void
1985 pmap_set_srd_fusing()
1986 {
1987 	DTEntry entry;
1988 	uint32_t const *prop = NULL;
1989 	int err;
1990 	unsigned int prop_size = 0;
1991 
1992 	err = SecureDTLookupEntry(NULL, "/chosen", &entry);
1993 	if (err != kSuccess) {
1994 		panic("PMAP: no chosen DT node");
1995 	}
1996 
1997 	if (kSuccess == SecureDTGetProperty(entry, "research-enabled", (const void**)&prop, &prop_size)) {
1998 		if (prop_size == sizeof(uint32_t)) {
1999 			srd_fused = *prop;
2000 		}
2001 	}
2002 
2003 #if DEVELOPMENT || DEBUG
2004 	PE_parse_boot_argn("srd_fusing", &srd_fused, sizeof(srd_fused));
2005 #endif
2006 }
2007 
2008 /*
2009  *	Bootstrap the system enough to run with virtual memory.
2010  *
2011  *	The early VM initialization code has already allocated
2012  *	the first CPU's translation table and made entries for
2013  *	all the one-to-one mappings to be found there.
2014  *
2015  *	We must set up the kernel pmap structures, the
2016  *	physical-to-virtual translation lookup tables for the
2017  *	physical memory to be managed (between avail_start and
2018  *	avail_end).
2019  *
2020  *	Map the kernel's code and data, and allocate the system page table.
2021  *	Page_size must already be set.
2022  *
2023  *	Parameters:
2024  *	first_avail	first available physical page -
2025  *			   after kernel page tables
2026  *	avail_start	PA of first managed physical page
2027  *	avail_end	PA of last managed physical page
2028  */
2029 
2030 void
2031 pmap_bootstrap(
2032 	vm_offset_t vstart)
2033 {
2034 	vm_map_offset_t maxoffset;
2035 
2036 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2037 
2038 	pmap_set_srd_fusing();
2039 
2040 #if XNU_MONITOR
2041 
2042 #if DEVELOPMENT || DEBUG
2043 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2044 #endif
2045 
2046 #if CONFIG_CSR_FROM_DT
2047 	if (csr_unsafe_kernel_text) {
2048 		pmap_ppl_disable = true;
2049 	}
2050 #endif /* CONFIG_CSR_FROM_DT */
2051 
2052 #endif /* XNU_MONITOR */
2053 
2054 #if DEVELOPMENT || DEBUG
2055 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2056 		kprintf("Kernel traces for pmap operations enabled\n");
2057 	}
2058 #endif
2059 
2060 	/*
2061 	 *	Initialize the kernel pmap.
2062 	 */
2063 	pmap_stamp = 1;
2064 #if ARM_PARAMETERIZED_PMAP
2065 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2066 #endif /* ARM_PARAMETERIZED_PMAP */
2067 #if HAS_APPLE_PAC
2068 	kernel_pmap->disable_jop = 0;
2069 #endif /* HAS_APPLE_PAC */
2070 	kernel_pmap->tte = cpu_tte;
2071 	kernel_pmap->ttep = cpu_ttep;
2072 #if (__ARM_VMSA__ > 7)
2073 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2074 #else
2075 	kernel_pmap->min = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
2076 #endif
2077 	kernel_pmap->max = UINTPTR_MAX;
2078 	os_atomic_init(&kernel_pmap->ref_count, 1);
2079 #if XNU_MONITOR
2080 	os_atomic_init(&kernel_pmap->nested_count, 0);
2081 #endif
2082 	kernel_pmap->gc_status = 0;
2083 	kernel_pmap->nx_enabled = TRUE;
2084 #ifdef  __arm64__
2085 	kernel_pmap->is_64bit = TRUE;
2086 #else
2087 	kernel_pmap->is_64bit = FALSE;
2088 #endif
2089 	kernel_pmap->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2090 
2091 #if ARM_PARAMETERIZED_PMAP
2092 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2093 #endif /* ARM_PARAMETERIZED_PMAP */
2094 
2095 	kernel_pmap->nested_region_addr = 0x0ULL;
2096 	kernel_pmap->nested_region_size = 0x0ULL;
2097 	kernel_pmap->nested_region_asid_bitmap = NULL;
2098 	kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2099 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2100 
2101 #if (__ARM_VMSA__ == 7)
2102 	kernel_pmap->tte_index_max = 4 * (ARM_PGBYTES / sizeof(tt_entry_t));
2103 #endif
2104 	kernel_pmap->hw_asid = 0;
2105 	kernel_pmap->sw_asid = 0;
2106 
2107 	pmap_lock_init(kernel_pmap);
2108 
2109 	pmap_max_asids = pmap_compute_max_asids();
2110 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2111 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2112 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2113 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2114 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2115 	_Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
2116 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2117 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2118 
2119 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2120 
2121 	/**
2122 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2123 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2124 	 * space for these data structures.
2125 	 */
2126 	pmap_data_bootstrap();
2127 
2128 	/**
2129 	 * Don't make any assumptions about the alignment of avail_start before this
2130 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2131 	 */
2132 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2133 
2134 	const pmap_paddr_t pmap_struct_start = avail_start;
2135 
2136 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2137 	avail_start = round_page(avail_start + asid_table_size);
2138 
2139 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2140 
2141 	vm_first_phys = gPhysBase;
2142 	vm_last_phys = trunc_page(avail_end);
2143 
2144 	queue_init(&map_pmap_list);
2145 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2146 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2147 	free_page_size_tt_count = 0;
2148 	free_page_size_tt_max = 0;
2149 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2150 	free_two_page_size_tt_count = 0;
2151 	free_two_page_size_tt_max = 0;
2152 	free_tt_list = TT_FREE_ENTRY_NULL;
2153 	free_tt_count = 0;
2154 	free_tt_max = 0;
2155 
2156 	virtual_space_start = vstart;
2157 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2158 
2159 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2160 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2161 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2162 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2163 
2164 
2165 
2166 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2167 		maxoffset = trunc_page(maxoffset);
2168 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2169 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2170 			arm_pmap_max_offset_default = maxoffset;
2171 		}
2172 	}
2173 #if defined(__arm64__)
2174 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2175 		maxoffset = trunc_page(maxoffset);
2176 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2177 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2178 			arm64_pmap_max_offset_default = maxoffset;
2179 		}
2180 	}
2181 #endif
2182 
2183 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2184 
2185 
2186 #if MACH_ASSERT
2187 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2188 	    &vm_footprint_suspend_allowed,
2189 	    sizeof(vm_footprint_suspend_allowed));
2190 #endif /* MACH_ASSERT */
2191 
2192 #if KASAN
2193 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2194 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2195 #endif /* KASAN */
2196 
2197 	/**
2198 	 * Ensure that avail_start is always left on a page boundary. The calling
2199 	 * code might not perform any alignment before allocating page tables so
2200 	 * this is important.
2201 	 */
2202 	avail_start = round_page(avail_start);
2203 }
2204 
2205 #if XNU_MONITOR
2206 
2207 static inline void
2208 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2209 {
2210 	pmap_paddr_t cur_pa;
2211 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2212 		assert(pa_valid(cur_pa));
2213 		ppattr_pa_set_monitor(cur_pa);
2214 	}
2215 }
2216 
2217 void
2218 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2219     pmap_paddr_t end_pa,
2220     unsigned int expected_perm,
2221     unsigned int new_perm)
2222 {
2223 	vm_offset_t start_va = phystokv(start_pa);
2224 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2225 
2226 	pa_set_range_monitor(start_pa, end_pa);
2227 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2228 }
2229 
2230 static void
2231 pmap_lockdown_kc(void)
2232 {
2233 	extern vm_offset_t vm_kernelcache_base;
2234 	extern vm_offset_t vm_kernelcache_top;
2235 	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
2236 	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
2237 	pmap_paddr_t cur_pa = start_pa;
2238 	vm_offset_t cur_va = vm_kernelcache_base;
2239 	while (cur_pa < end_pa) {
2240 		vm_size_t range_size = end_pa - cur_pa;
2241 		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
2242 		if (ptov_va != cur_va) {
2243 			/*
2244 			 * If the physical address maps back to a virtual address that is non-linear
2245 			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
2246 			 * reclaimed by the OS and should therefore not be locked down.
2247 			 */
2248 			cur_pa += range_size;
2249 			cur_va += range_size;
2250 			continue;
2251 		}
2252 		unsigned int pai = pa_index(cur_pa);
2253 		pv_entry_t **pv_h  = pai_to_pvh(pai);
2254 
2255 		vm_offset_t pvh_flags = pvh_get_flags(pv_h);
2256 
2257 		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2258 			panic("pai %d already locked down", pai);
2259 		}
2260 		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
2261 		cur_pa += ARM_PGBYTES;
2262 		cur_va += ARM_PGBYTES;
2263 	}
2264 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
2265 	extern uint64_t ctrr_ro_test;
2266 	extern uint64_t ctrr_nx_test;
2267 	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
2268 	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
2269 		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
2270 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
2271 	}
2272 #endif
2273 }
2274 
2275 void
2276 pmap_static_allocations_done(void)
2277 {
2278 	pmap_paddr_t monitor_start_pa;
2279 	pmap_paddr_t monitor_end_pa;
2280 
2281 	/*
2282 	 * Protect the bootstrap (V=P and V->P) page tables.
2283 	 *
2284 	 * These bootstrap allocations will be used primarily for page tables.
2285 	 * If we wish to secure the page tables, we need to start by marking
2286 	 * these bootstrap allocations as pages that we want to protect.
2287 	 */
2288 	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2289 	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2290 
2291 	/* The bootstrap page tables are mapped RW at bootstrap. */
2292 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2293 
2294 	/*
2295 	 * We use avail_start as a pointer to the first address that has not
2296 	 * been reserved for bootstrap, so we know which pages to give to the
2297 	 * virtual memory layer.
2298 	 */
2299 	monitor_start_pa = BootArgs->topOfKernelData;
2300 	monitor_end_pa = avail_start;
2301 
2302 	/* The other bootstrap allocations are mapped RW at bootstrap. */
2303 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2304 
2305 	/*
2306 	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2307 	 * to RO in arm_vm_prot_finalize(), which is called after this function.
2308 	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2309 	 * they can't be allocated for other uses.  We don't need a special xPRR
2310 	 * protection index, as there is no PPL_RO index, and these pages are ultimately
2311 	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
2312 	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2313 	 * lookup table index to USER_XO before APRR is applied, leading the hardware
2314 	 * to believe we are dealing with a user XO page upon performing a translation.
2315 	 */
2316 	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2317 	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2318 	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2319 
2320 	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2321 	monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2322 
2323 	/* PPL data is RW for the PPL, RO for the kernel. */
2324 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2325 
2326 	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2327 	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2328 
2329 	/* PPL text is RX for the PPL, RO for the kernel. */
2330 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2331 
2332 
2333 	/*
2334 	 * In order to support DTrace, the save areas for the PPL must be
2335 	 * writable.  This is due to the fact that DTrace will try to update
2336 	 * register state.
2337 	 */
2338 	if (pmap_ppl_disable) {
2339 		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2340 		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2341 
2342 		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2343 	}
2344 
2345 
2346 	if (segSizePPLDATACONST > 0) {
2347 		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2348 		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2349 
2350 		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2351 	}
2352 
2353 	/*
2354 	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2355 	 * precaution.  The real RW mappings are at a different location with guard pages.
2356 	 */
2357 	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2358 
2359 	/* Prevent remapping of the kernelcache */
2360 	pmap_lockdown_kc();
2361 }
2362 
2363 void
2364 pmap_lockdown_ppl(void)
2365 {
2366 	/* Mark the PPL as being locked down. */
2367 
2368 #error "XPRR configuration error"
2369 }
2370 #endif /* XNU_MONITOR */
2371 
2372 void
2373 pmap_virtual_space(
2374 	vm_offset_t *startp,
2375 	vm_offset_t *endp
2376 	)
2377 {
2378 	*startp = virtual_space_start;
2379 	*endp = virtual_space_end;
2380 }
2381 
2382 
2383 boolean_t
2384 pmap_virtual_region(
2385 	unsigned int region_select,
2386 	vm_map_offset_t *startp,
2387 	vm_map_size_t *size
2388 	)
2389 {
2390 	boolean_t       ret = FALSE;
2391 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2392 	if (region_select == 0) {
2393 		/*
2394 		 * In this config, the bootstrap mappings should occupy their own L2
2395 		 * TTs, as they should be immutable after boot.  Having the associated
2396 		 * TTEs and PTEs in their own pages allows us to lock down those pages,
2397 		 * while allowing the rest of the kernel address range to be remapped.
2398 		 */
2399 #if     (__ARM_VMSA__ > 7)
2400 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2401 #else
2402 #error Unsupported configuration
2403 #endif
2404 #if defined(ARM_LARGE_MEMORY)
2405 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2406 #else
2407 		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2408 #endif
2409 		ret = TRUE;
2410 	}
2411 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2412 #if defined(ARM_LARGE_MEMORY)
2413 	/* For large memory systems with no KTRR/CTRR such as virtual machines */
2414 #if     (__ARM_VMSA__ > 7)
2415 	*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2416 #else
2417 #error Unsupported configuration
2418 #endif
2419 	if (region_select == 0) {
2420 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2421 		ret = TRUE;
2422 	}
2423 #else /* !defined(ARM_LARGE_MEMORY) */
2424 #if     (__ARM_VMSA__ > 7)
2425 	unsigned long low_global_vr_mask = 0;
2426 	vm_map_size_t low_global_vr_size = 0;
2427 #endif
2428 
2429 	if (region_select == 0) {
2430 #if     (__ARM_VMSA__ == 7)
2431 		*startp = gVirtBase & 0xFFC00000;
2432 		*size = ((virtual_space_start - (gVirtBase & 0xFFC00000)) + ~0xFFC00000) & 0xFFC00000;
2433 #else
2434 		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2435 		if (!TEST_PAGE_SIZE_4K) {
2436 			*startp = gVirtBase & 0xFFFFFFFFFE000000;
2437 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2438 		} else {
2439 			*startp = gVirtBase & 0xFFFFFFFFFF800000;
2440 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2441 		}
2442 #endif
2443 		ret = TRUE;
2444 	}
2445 	if (region_select == 1) {
2446 		*startp = VREGION1_START;
2447 		*size = VREGION1_SIZE;
2448 		ret = TRUE;
2449 	}
2450 #if     (__ARM_VMSA__ > 7)
2451 	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2452 	if (!TEST_PAGE_SIZE_4K) {
2453 		low_global_vr_mask = 0xFFFFFFFFFE000000;
2454 		low_global_vr_size = 0x2000000;
2455 	} else {
2456 		low_global_vr_mask = 0xFFFFFFFFFF800000;
2457 		low_global_vr_size = 0x800000;
2458 	}
2459 
2460 	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2461 		*startp = LOW_GLOBAL_BASE_ADDRESS;
2462 		*size = low_global_vr_size;
2463 		ret = TRUE;
2464 	}
2465 
2466 	if (region_select == 3) {
2467 		/* In this config, we allow the bootstrap mappings to occupy the same
2468 		 * page table pages as the heap.
2469 		 */
2470 		*startp = VM_MIN_KERNEL_ADDRESS;
2471 		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2472 		ret = TRUE;
2473 	}
2474 #endif
2475 #endif /* defined(ARM_LARGE_MEMORY) */
2476 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2477 	return ret;
2478 }
2479 
2480 /*
2481  * Routines to track and allocate physical pages during early boot.
2482  * On most systems that memory runs from first_avail through to avail_end
2483  * with no gaps.
2484  *
2485  * However if the system supports ECC and bad_ram_pages_count > 0, we
2486  * need to be careful and skip those pages.
2487  */
2488 static unsigned int avail_page_count = 0;
2489 static bool need_ram_ranges_init = true;
2490 
2491 #if defined(__arm64__)
2492 pmap_paddr_t *bad_ram_pages = NULL;
2493 unsigned int bad_ram_pages_count = 0;
2494 
2495 /*
2496  * We use this sub-range of bad_ram_pages for pmap_next_page()
2497  */
2498 static pmap_paddr_t *skip_pages;
2499 static unsigned int skip_pages_count = 0;
2500 
2501 #define MAX_BAD_RAM_PAGE_COUNT 64
2502 static pmap_paddr_t bad_ram_pages_arr[MAX_BAD_RAM_PAGE_COUNT];
2503 
2504 /*
2505  * XXX - temporary code to get the bad pages array from boot-args.
2506  * expects a comma separated list of offsets from the start
2507  * of physical memory to be considered bad.
2508  *
2509  * HERE JOE -- will eventually be replaced by data provided by iboot
2510  */
2511 static void
2512 parse_bad_ram_pages_boot_arg(void)
2513 {
2514 	char buf[256] = {0};
2515 	char *s = buf;
2516 	char *end;
2517 	int count = 0;
2518 	pmap_paddr_t num;
2519 	extern uint64_t strtouq(const char *, char **, int);
2520 
2521 	if (!PE_parse_boot_arg_str("bad_ram_pages", buf, sizeof(buf))) {
2522 		goto done;
2523 	}
2524 
2525 	while (*s && count < MAX_BAD_RAM_PAGE_COUNT) {
2526 		num = (pmap_paddr_t)strtouq(s, &end, 0);
2527 		if (num == 0) {
2528 			break;
2529 		}
2530 		num &= ~PAGE_MASK;
2531 
2532 		bad_ram_pages_arr[count++] = gDramBase + num;
2533 
2534 		if (*end != ',') {
2535 			break;
2536 		}
2537 
2538 		s = end + 1;
2539 	}
2540 
2541 done:
2542 	bad_ram_pages = bad_ram_pages_arr;
2543 	bad_ram_pages_count = count;
2544 }
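/*
 * Illustrative boot-arg format (hypothetical offsets, not from the original
 * source):
 *
 *     bad_ram_pages=0x4000,0x20000
 *
 * Each value is an offset from the start of DRAM; it is truncated to a page
 * boundary and recorded as gDramBase + offset.  Parsing stops at the first
 * zero value, at a separator other than ',', or after MAX_BAD_RAM_PAGE_COUNT
 * entries.
 */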
2545 
2546 /*
2547  * Comparison routine for qsort of array of physical addresses.
2548  */
2549 static int
2550 pmap_paddr_cmp(void *a, void *b)
2551 {
2552 	pmap_paddr_t *x = a;
2553 	pmap_paddr_t *y = b;
2554 	if (*x < *y) {
2555 		return -1;
2556 	}
2557 	return *x > *y;
2558 }
2559 #endif /* defined(__arm64__) */
2560 
2561 /*
2562  * Look up ppn in the sorted bad_ram_pages array.
2563  */
2564 bool
2565 pmap_is_bad_ram(__unused ppnum_t ppn)
2566 {
2567 #if defined(__arm64__)
2568 	pmap_paddr_t pa = ptoa(ppn);
2569 	int low = 0;
2570 	int high = bad_ram_pages_count - 1;
2571 	int mid;
2572 
2573 	while (low <= high) {
2574 		mid = (low + high) / 2;
2575 		if (bad_ram_pages[mid] < pa) {
2576 			low = mid + 1;
2577 		} else if (bad_ram_pages[mid] > pa) {
2578 			high = mid - 1;
2579 		} else {
2580 			return true;
2581 		}
2582 	}
2583 #endif /* defined(__arm64__) */
2584 	return false;
2585 }
2586 
2587 /*
2588  * Initialize the count of available pages. If we have bad_ram_pages, then sort the list of them.
2589  * No lock needed here, as this code is called while kernel boot up is single threaded.
2590  */
2591 static void
2592 initialize_ram_ranges(void)
2593 {
2594 	pmap_paddr_t first = first_avail;
2595 	pmap_paddr_t end = avail_end;
2596 
2597 	assert(first <= end);
2598 	assert(first == (first & ~PAGE_MASK));
2599 	assert(end == (end & ~PAGE_MASK));
2600 	avail_page_count = atop(end - first);
2601 
2602 #if defined(__arm64__)
2603 	/*
2604 	 * XXX Temporary code for testing, until there is iboot support
2605 	 *
2606 	 * Parse a list of known bad pages from a boot-args.
2607 	 */
2608 	parse_bad_ram_pages_boot_arg();
2609 
2610 	/*
2611 	 * Sort and filter the bad pages list and adjust avail_page_count.
2612 	 */
2613 	if (bad_ram_pages_count != 0) {
2614 		qsort(bad_ram_pages, bad_ram_pages_count, sizeof(*bad_ram_pages), (cmpfunc_t)pmap_paddr_cmp);
2615 		skip_pages = bad_ram_pages;
2616 		skip_pages_count = bad_ram_pages_count;
2617 
2618 		/* ignore any pages before first */
2619 		while (skip_pages_count > 0 && skip_pages[0] < first) {
2620 			--skip_pages_count;
2621 			++skip_pages;
2622 		}
2623 
2624 		/* ignore any pages at or after end */
2625 		while (skip_pages_count > 0 && skip_pages[skip_pages_count - 1] >= end) {
2626 			--skip_pages_count;
2627 		}
2628 
2629 		avail_page_count -= skip_pages_count;
2630 	}
2631 #endif /* defined(__arm64__) */
2632 	need_ram_ranges_init = false;
2633 }
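/*
 * Illustrative trimming example (hypothetical addresses, not from the original
 * source): if the sorted bad-page list is { first - 0x4000, first + 0x8000,
 * end + 0x4000 }, the entries below "first" and at or above "end" are dropped,
 * skip_pages is left pointing at the single in-range entry, and
 * avail_page_count is reduced by one so pmap_next_page() can skip it.
 */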
2634 
2635 unsigned int
2636 pmap_free_pages(
2637 	void)
2638 {
2639 	if (need_ram_ranges_init) {
2640 		initialize_ram_ranges();
2641 	}
2642 	return avail_page_count;
2643 }
2644 
2645 unsigned int
2646 pmap_free_pages_span(
2647 	void)
2648 {
2649 	if (need_ram_ranges_init) {
2650 		initialize_ram_ranges();
2651 	}
2652 	return (unsigned int)atop(avail_end - first_avail);
2653 }
2654 
2655 
2656 boolean_t
2657 pmap_next_page_hi(
2658 	ppnum_t            * pnum,
2659 	__unused boolean_t might_free)
2660 {
2661 	return pmap_next_page(pnum);
2662 }
2663 
2664 
2665 boolean_t
2666 pmap_next_page(
2667 	ppnum_t *pnum)
2668 {
2669 	if (need_ram_ranges_init) {
2670 		initialize_ram_ranges();
2671 	}
2672 
2673 #if defined(__arm64__)
2674 	/*
2675 	 * Skip over any known bad pages.
2676 	 */
2677 	while (skip_pages_count > 0 && first_avail == skip_pages[0]) {
2678 		first_avail += PAGE_SIZE;
2679 		++skip_pages;
2680 		--skip_pages_count;
2681 	}
2682 #endif /* defined(__arm64__) */
2683 
2684 	if (first_avail != avail_end) {
2685 		*pnum = (ppnum_t)atop(first_avail);
2686 		first_avail += PAGE_SIZE;
2687 		assert(avail_page_count > 0);
2688 		--avail_page_count;
2689 		return TRUE;
2690 	}
2691 	assert(avail_page_count == 0);
2692 	return FALSE;
2693 }
2694 
2695 void
2696 pmap_retire_page(
2697 	__unused ppnum_t pnum)
2698 {
2699 	/* XXX Justin TBD - mark the page as unusable in pmap data structures */
2700 }
2701 
2702 
2703 /*
2704  *	Initialize the pmap module.
2705  *	Called by vm_init, to initialize any structures that the pmap
2706  *	system needs to map virtual memory.
2707  */
2708 void
2709 pmap_init(
2710 	void)
2711 {
2712 	/*
2713 	 *	Protect page zero in the kernel map.
2714 	 *	(can be overruled by permanent translation
2715 	 *	table entries at page zero - see arm_vm_init).
2716 	 */
2717 	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2718 
2719 	pmap_initialized = TRUE;
2720 
2721 	/*
2722 	 *	Create the zone of physical maps
2723 	 *	and the physical-to-virtual entries.
2724 	 */
2725 	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2726 	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2727 
2728 
2729 	/*
2730 	 *	Initialize the pmap object (for tracking the vm_page_t
2731 	 *	structures for pages we allocate to be page tables in
2732 	 *	pmap_expand()).
2733 	 */
2734 	_vm_object_allocate(mem_size, pmap_object);
2735 	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2736 
2737 	/*
2738 	 * The values of [hard_]maxproc may have been scaled, make sure
2739 	 * they are still less than the value of pmap_max_asids.
2740 	 */
2741 	if ((uint32_t)maxproc > pmap_max_asids) {
2742 		maxproc = pmap_max_asids;
2743 	}
2744 	if ((uint32_t)hard_maxproc > pmap_max_asids) {
2745 		hard_maxproc = pmap_max_asids;
2746 	}
2747 }
2748 
2749 /**
2750  * Verify that a given physical page contains no mappings (outside of the
2751  * default physical aperture mapping).
2752  *
2753  * @param ppnum Physical page number to check there are no mappings to.
2754  *
2755  * @return True if there are no mappings, false otherwise or if the page is not
2756  *         kernel-managed.
2757  */
2758 bool
2759 pmap_verify_free(ppnum_t ppnum)
2760 {
2761 	const pmap_paddr_t pa = ptoa(ppnum);
2762 
2763 	assert(pa != vm_page_fictitious_addr);
2764 
2765 	/* Only mappings to kernel-managed physical memory are tracked. */
2766 	if (!pa_valid(pa)) {
2767 		return false;
2768 	}
2769 
2770 	const unsigned int pai = pa_index(pa);
2771 	pv_entry_t **pvh = pai_to_pvh(pai);
2772 
2773 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2774 }
2775 
2776 #if MACH_ASSERT
2777 /**
2778  * Verify that a given physical page contains no mappings (outside of the
2779  * default physical aperture mapping) and if it does, then panic.
2780  *
2781  * @note It's recommended to use pmap_verify_free() directly when operating in
2782  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2783  *       normally being called from outside of the PPL, and the pv_head_table
2784  *       can't be modified outside of the PPL).
2785  *
2786  * @param ppnum Physical page number to check there are no mappings to.
2787  */
2788 void
2789 pmap_assert_free(ppnum_t ppnum)
2790 {
2791 	const pmap_paddr_t pa = ptoa(ppnum);
2792 
2793 	/* Only mappings to kernel-managed physical memory are tracked. */
2794 	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2795 		return;
2796 	}
2797 
2798 	const unsigned int pai = pa_index(pa);
2799 	pv_entry_t **pvh = pai_to_pvh(pai);
2800 
2801 	/**
2802 	 * This function is always called from outside of the PPL. Because of this,
2803 	 * the PVH entry can't be locked. This function is generally only called
2804 	 * before the VM reclaims a physical page and shouldn't be creating new
2805 	 * mappings. Even if a new mapping is created while parsing the hierarchy,
2806 	 * the worst case is that the system will panic in another way, and we were
2807 	 * already about to panic anyway.
2808 	 */
2809 
2810 	/**
2811 	 * Since pmap_verify_free() returned false, that means there is at least one
2812 	 * mapping left. Let's get some extra info on the first mapping we find to
2813 	 * dump in the panic string (the common case is that there is one spare
2814 	 * mapping that was never unmapped).
2815 	 */
2816 	pt_entry_t *first_ptep = PT_ENTRY_NULL;
2817 
2818 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2819 		first_ptep = pvh_ptep(pvh);
2820 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2821 		pv_entry_t *pvep = pvh_pve_list(pvh);
2822 
2823 		/* Each PVE can contain multiple PTEs. Let's find the first one. */
2824 		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2825 			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2826 			if (first_ptep != PT_ENTRY_NULL) {
2827 				break;
2828 			}
2829 		}
2830 
2831 		/* The PVE should have at least one valid PTE. */
2832 		assert(first_ptep != PT_ENTRY_NULL);
2833 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2834 		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2835 		    __func__, pvh, pai);
2836 	} else {
2837 		/**
2838 		 * The mapping disappeared between here and the pmap_verify_free() call.
2839 		 * The only way that can happen is if the VM was racing this call with
2840 		 * a call that unmaps PTEs. Operations on this page should not be
2841 		 * occurring at the same time as this check, and unfortunately we can't
2842 		 * lock the PVH entry to prevent it, so just panic instead.
2843 		 */
2844 		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2845 		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2846 		    __func__, pvh, pai);
2847 	}
2848 
2849 	/* Panic with a unique string identifying the first bad mapping and owner. */
2850 	{
2851 		/* First PTE is mapped by the main CPUs. */
2852 		pmap_t pmap = ptep_get_pmap(first_ptep);
2853 		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2854 
2855 		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2856 		    "%s CPU mapping (pmap: %p)",
2857 		    __func__, (uint64_t)pa, first_ptep, type, pmap);
2858 	}
2859 }
2860 #endif
2861 
2862 
2863 static vm_size_t
2864 pmap_root_alloc_size(pmap_t pmap)
2865 {
2866 #if (__ARM_VMSA__ > 7)
2867 #pragma unused(pmap)
2868 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2869 	unsigned int root_level = pt_attr_root_level(pt_attr);
2870 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2871 #else
2872 	(void)pmap;
2873 	return PMAP_ROOT_ALLOC_SIZE;
2874 #endif
2875 }
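/*
 * Illustrative note (assumption, not from the original source): the expression
 * above is simply "number of root-level indices" times sizeof(tt_entry_t).
 * For a hypothetical root level with an 11-bit index that works out to
 * 2048 * 8 = 16KB, whereas the 32-bit VMSA path uses the fixed
 * PMAP_ROOT_ALLOC_SIZE.
 */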
2876 
2877 /*
2878  *	Create and return a physical map.
2879  *
2880  *	If the size specified for the map
2881  *	is zero, the map is an actual physical
2882  *	map, and may be referenced by the
2883  *	hardware.
2884  *
2885  *	If the size specified is non-zero,
2886  *	the map will be used in software only, and
2887  *	is bounded by that size.
2888  */
2889 MARK_AS_PMAP_TEXT pmap_t
2890 pmap_create_options_internal(
2891 	ledger_t ledger,
2892 	vm_map_size_t size,
2893 	unsigned int flags,
2894 	kern_return_t *kr)
2895 {
2896 	unsigned        i;
2897 	unsigned        tte_index_max;
2898 	pmap_t          p;
2899 	bool is_64bit = flags & PMAP_CREATE_64BIT;
2900 #if defined(HAS_APPLE_PAC)
2901 	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2902 #endif /* defined(HAS_APPLE_PAC) */
2903 	kern_return_t   local_kr = KERN_SUCCESS;
2904 
2905 	/*
2906 	 *	A software use-only map doesn't even need a pmap.
2907 	 */
2908 	if (size != 0) {
2909 		return PMAP_NULL;
2910 	}
2911 
2912 	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2913 		return PMAP_NULL;
2914 	}
2915 
2916 #if XNU_MONITOR
2917 	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
2918 		goto pmap_create_fail;
2919 	}
2920 
2921 	assert(p != PMAP_NULL);
2922 
2923 	if (ledger) {
2924 		pmap_ledger_validate(ledger);
2925 		pmap_ledger_retain(ledger);
2926 	}
2927 #else
2928 	/*
2929 	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
2930 	 *	the translation table of the right size for the pmap.
2931 	 */
2932 	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2933 		local_kr = KERN_RESOURCE_SHORTAGE;
2934 		goto pmap_create_fail;
2935 	}
2936 #endif
2937 
2938 	p->ledger = ledger;
2939 
2940 
2941 	p->pmap_vm_map_cs_enforced = false;
2942 
2943 	p->min = 0;
2944 	if (flags & PMAP_CREATE_64BIT) {
2945 	} else {
2946 	}
2947 
2948 #if defined(HAS_APPLE_PAC)
2949 	p->disable_jop = disable_jop;
2950 #endif /* defined(HAS_APPLE_PAC) */
2951 
2952 	p->nested_region_true_start = 0;
2953 	p->nested_region_true_end = ~0;
2954 
2955 	p->gc_status = 0;
2956 	p->stamp = os_atomic_inc(&pmap_stamp, relaxed);
2957 	p->nx_enabled = true;
2958 	p->is_64bit = is_64bit;
2959 	p->nested_pmap = PMAP_NULL;
2960 	p->type = PMAP_TYPE_USER;
2961 
2962 #if ARM_PARAMETERIZED_PMAP
2963 	/* Default to the native pt_attr */
2964 	p->pmap_pt_attr = native_pt_attr;
2965 #endif /* ARM_PARAMETERIZED_PMAP */
2966 #if __ARM_MIXED_PAGE_SIZE__
2967 	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2968 		p->pmap_pt_attr = &pmap_pt_attr_4k;
2969 	}
2970 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2971 	p->max = pmap_user_va_size(p);
2972 
2973 	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2974 		local_kr = KERN_NO_SPACE;
2975 		goto id_alloc_fail;
2976 	}
2977 
2978 	pmap_lock_init(p);
2979 
2980 	p->tt_entry_free = (tt_entry_t *)0;
2981 	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
2982 
2983 #if     (__ARM_VMSA__ == 7)
2984 	p->tte_index_max = tte_index_max;
2985 #endif
2986 
2987 #if XNU_MONITOR
2988 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
2989 #else
2990 	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
2991 #endif
2992 	if (!(p->tte)) {
2993 		local_kr = KERN_RESOURCE_SHORTAGE;
2994 		goto tt1_alloc_fail;
2995 	}
2996 
2997 	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
2998 	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2999 
3000 	/* nullify the translation table */
3001 	for (i = 0; i < tte_index_max; i++) {
3002 		p->tte[i] = ARM_TTE_TYPE_FAULT;
3003 	}
3004 
3005 	FLUSH_PTE();
3006 
3007 	/*
3008 	 *  initialize the rest of the structure
3009 	 */
3010 	p->nested_region_addr = 0x0ULL;
3011 	p->nested_region_size = 0x0ULL;
3012 	p->nested_region_asid_bitmap = NULL;
3013 	p->nested_region_asid_bitmap_size = 0x0UL;
3014 
3015 	p->nested_has_no_bounds_ref = false;
3016 	p->nested_no_bounds_refcnt = 0;
3017 	p->nested_bounds_set = false;
3018 
3019 
3020 #if MACH_ASSERT
3021 	p->pmap_stats_assert = TRUE;
3022 	p->pmap_pid = 0;
3023 	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3024 #endif /* MACH_ASSERT */
3025 #if DEVELOPMENT || DEBUG
3026 	p->footprint_was_suspended = FALSE;
3027 #endif /* DEVELOPMENT || DEBUG */
3028 
3029 #if XNU_MONITOR
3030 	os_atomic_init(&p->nested_count, 0);
3031 	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3032 	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3033 	os_atomic_thread_fence(release);
3034 #endif
3035 	os_atomic_init(&p->ref_count, 1);
3036 	pmap_simple_lock(&pmaps_lock);
3037 	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3038 	pmap_simple_unlock(&pmaps_lock);
3039 
3040 	return p;
3041 
3042 tt1_alloc_fail:
3043 	pmap_get_pt_ops(p)->free_id(p);
3044 id_alloc_fail:
3045 #if XNU_MONITOR
3046 	pmap_free_pmap(p);
3047 
3048 	if (ledger) {
3049 		pmap_ledger_release(ledger);
3050 	}
3051 #else
3052 	zfree(pmap_zone, p);
3053 #endif
3054 pmap_create_fail:
3055 #if XNU_MONITOR
3056 	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3057 #endif
3058 	*kr = local_kr;
3059 #if XNU_MONITOR
3060 	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3061 #endif
3062 	return PMAP_NULL;
3063 }
3064 
3065 pmap_t
3066 pmap_create_options(
3067 	ledger_t ledger,
3068 	vm_map_size_t size,
3069 	unsigned int flags)
3070 {
3071 	pmap_t pmap;
3072 	kern_return_t kr = KERN_SUCCESS;
3073 
3074 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3075 
3076 	ledger_reference(ledger);
3077 
3078 #if XNU_MONITOR
3079 	for (;;) {
3080 		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3081 		if (kr != KERN_RESOURCE_SHORTAGE) {
3082 			break;
3083 		}
3084 		assert(pmap == PMAP_NULL);
3085 		pmap_alloc_page_for_ppl(0);
3086 		kr = KERN_SUCCESS;
3087 	}
3088 #else
3089 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3090 #endif
3091 
3092 	if (pmap == PMAP_NULL) {
3093 		ledger_dereference(ledger);
3094 	}
3095 
3096 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3097 
3098 	return pmap;
3099 }
3100 
3101 #if XNU_MONITOR
3102 /*
3103  * This symbol remains in place when the PPL is enabled so that the dispatch
3104  * table does not change from development to release configurations.
3105  */
3106 #endif
3107 #if MACH_ASSERT || XNU_MONITOR
3108 MARK_AS_PMAP_TEXT void
3109 pmap_set_process_internal(
3110 	__unused pmap_t pmap,
3111 	__unused int pid,
3112 	__unused char *procname)
3113 {
3114 #if MACH_ASSERT
3115 	if (pmap == NULL) {
3116 		return;
3117 	}
3118 
3119 	validate_pmap_mutable(pmap);
3120 
3121 	pmap->pmap_pid = pid;
3122 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3123 	if (pmap_ledgers_panic_leeway) {
3124 		/*
3125 		 * XXX FBDP
3126 		 * Some processes somehow trigger some issues that make
3127 		 * the pmap stats and ledgers go off track, causing
3128 		 * some assertion failures and ledger panics.
3129 		 * Turn off the sanity checks if we allow some ledger leeway
3130 		 * because of that.  We'll still do a final check in
3131 		 * pmap_check_ledgers() for discrepancies larger than the
3132 		 * allowed leeway after the address space has been fully
3133 		 * cleaned up.
3134 		 */
3135 		pmap->pmap_stats_assert = FALSE;
3136 		ledger_disable_panic_on_negative(pmap->ledger,
3137 		    task_ledgers.phys_footprint);
3138 		ledger_disable_panic_on_negative(pmap->ledger,
3139 		    task_ledgers.internal);
3140 		ledger_disable_panic_on_negative(pmap->ledger,
3141 		    task_ledgers.internal_compressed);
3142 		ledger_disable_panic_on_negative(pmap->ledger,
3143 		    task_ledgers.iokit_mapped);
3144 		ledger_disable_panic_on_negative(pmap->ledger,
3145 		    task_ledgers.alternate_accounting);
3146 		ledger_disable_panic_on_negative(pmap->ledger,
3147 		    task_ledgers.alternate_accounting_compressed);
3148 	}
3149 #endif /* MACH_ASSERT */
3150 }
3151 #endif /* MACH_ASSERT || XNU_MONITOR */
3152 
3153 #if MACH_ASSERT
3154 void
3155 pmap_set_process(
3156 	pmap_t pmap,
3157 	int pid,
3158 	char *procname)
3159 {
3160 #if XNU_MONITOR
3161 	pmap_set_process_ppl(pmap, pid, procname);
3162 #else
3163 	pmap_set_process_internal(pmap, pid, procname);
3164 #endif
3165 }
3166 #endif /* MACH_ASSERT */
3167 
3168 #if (__ARM_VMSA__ > 7)
3169 /*
3170  * pmap_deallocate_all_leaf_tts:
3171  *
3172  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3173  * removing and deallocating all TTEs.
3174  */
3175 MARK_AS_PMAP_TEXT static void
3176 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3177 {
3178 	tt_entry_t tte = ARM_TTE_EMPTY;
3179 	tt_entry_t * ttep = NULL;
3180 	tt_entry_t * last_ttep = NULL;
3181 
3182 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3183 
3184 	assert(level < pt_attr_leaf_level(pt_attr));
3185 
3186 	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3187 
3188 	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3189 		tte = *ttep;
3190 
3191 		if (!(tte & ARM_TTE_VALID)) {
3192 			continue;
3193 		}
3194 
3195 		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
3196 			panic("%s: found block mapping, ttep=%p, tte=%p, "
3197 			    "pmap=%p, first_ttep=%p, level=%u",
3198 			    __FUNCTION__, ttep, (void *)tte,
3199 			    pmap, first_ttep, level);
3200 		}
3201 
3202 		/* Must be valid, type table */
3203 		if (level < pt_attr_twig_level(pt_attr)) {
3204 			/* If we haven't reached the twig level, recurse to the next level. */
3205 			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3206 		}
3207 
3208 		/* Remove the TTE. */
3209 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3210 		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3211 	}
3212 }
3213 #endif /* (__ARM_VMSA__ > 7) */
3214 
3215 /*
3216  * We maintain stats and ledgers so that a task's physical footprint is:
3217  * phys_footprint = ((internal - alternate_accounting)
3218  *                   + (internal_compressed - alternate_accounting_compressed)
3219  *                   + iokit_mapped
3220  *                   + purgeable_nonvolatile
3221  *                   + purgeable_nonvolatile_compressed
3222  *                   + page_table)
3223  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3224  */
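/*
 * A worked example of the formula above, using purely hypothetical page
 * counts: with internal = 40, alternate_accounting = 5,
 * internal_compressed = 10, alternate_accounting_compressed = 2,
 * iokit_mapped = 3, purgeable_nonvolatile = 4,
 * purgeable_nonvolatile_compressed = 1 and page_table = 6,
 *
 *   phys_footprint = (40 - 5) + (10 - 2) + 3 + 4 + 1 + 6 = 57 pages.
 */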
3225 
3226 /*
3227  *	Retire the given physical map from service.
3228  *	Should only be called if the map contains
3229  *	no valid mappings.
3230  */
3231 MARK_AS_PMAP_TEXT void
3232 pmap_destroy_internal(
3233 	pmap_t pmap)
3234 {
3235 	if (pmap == PMAP_NULL) {
3236 		return;
3237 	}
3238 
3239 	validate_pmap(pmap);
3240 
3241 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3242 
3243 	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
3244 	if (ref_count > 0) {
3245 		return;
3246 	} else if (__improbable(ref_count < 0)) {
3247 		panic("pmap %p: refcount underflow", pmap);
3248 	} else if (__improbable(pmap == kernel_pmap)) {
3249 		panic("pmap %p: attempt to destroy kernel pmap", pmap);
3250 	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3251 		panic("pmap %p: attempt to destroy commpage pmap", pmap);
3252 	}
3253 
3254 #if XNU_MONITOR
3255 	/*
3256 	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3257 	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3258 	 * That ensures that if the pmap is currently in use elsewhere, this path will
3259 	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3260 	 * ref_count of 0 and panic.
3261 	 */
3262 	os_atomic_thread_fence(seq_cst);
3263 	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3264 		panic("pmap %p: attempt to destroy while nested", pmap);
3265 	}
3266 	const int max_cpu = ml_get_max_cpu_number();
3267 	for (unsigned int i = 0; i <= max_cpu; ++i) {
3268 		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3269 		if (cpu_data == NULL) {
3270 			continue;
3271 		}
3272 		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3273 			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3274 		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3275 			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3276 		}
3277 	}
3278 #endif
3279 #if (__ARM_VMSA__ > 7)
3280 	pmap_unmap_sharedpage(pmap);
3281 #endif /* (__ARM_VMSA__ > 7) */
3282 
3283 	pmap_simple_lock(&pmaps_lock);
3284 #if !XNU_MONITOR
3285 	while (pmap->gc_status & PMAP_GC_INFLIGHT) {
3286 		pmap->gc_status |= PMAP_GC_WAIT;
3287 		assert_wait((event_t) &pmap->gc_status, THREAD_UNINT);
3288 		pmap_simple_unlock(&pmaps_lock);
3289 		(void) thread_block(THREAD_CONTINUE_NULL);
3290 		pmap_simple_lock(&pmaps_lock);
3291 	}
3292 #endif /* !XNU_MONITOR */
3293 	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3294 	pmap_simple_unlock(&pmaps_lock);
3295 
3296 	pmap_trim_self(pmap);
3297 
3298 	/*
3299 	 *	Free the memory maps, then the
3300 	 *	pmap structure.
3301 	 */
3302 #if (__ARM_VMSA__ == 7)
3303 	unsigned int i = 0;
3304 	pt_entry_t     *ttep;
3305 
3306 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3307 	for (i = 0; i < pmap->tte_index_max; i++) {
3308 		ttep = &pmap->tte[i];
3309 		if ((*ttep & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
3310 			pmap_tte_deallocate(pmap, 0, 0, false, ttep, PMAP_TT_L1_LEVEL);
3311 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3312 		}
3313 	}
3314 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3315 #else /* (__ARM_VMSA__ == 7) */
3316 	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3317 #endif /* (__ARM_VMSA__ == 7) */
3318 
3319 
3320 
3321 	if (pmap->tte) {
3322 #if (__ARM_VMSA__ == 7)
3323 		pmap_tt1_deallocate(pmap, pmap->tte, pmap->tte_index_max * sizeof(tt_entry_t), 0);
3324 		pmap->tte_index_max = 0;
3325 #else /* (__ARM_VMSA__ == 7) */
3326 		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3327 #endif /* (__ARM_VMSA__ == 7) */
3328 		pmap->tte = (tt_entry_t *) NULL;
3329 		pmap->ttep = 0;
3330 	}
3331 
3332 	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3333 
3334 	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
3335 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
3336 		sync_tlb_flush();
3337 	} else {
3338 		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3339 		sync_tlb_flush();
3340 		/* return its asid to the pool */
3341 		pmap_get_pt_ops(pmap)->free_id(pmap);
3342 		if (pmap->nested_pmap != NULL) {
3343 #if XNU_MONITOR
3344 			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3345 #endif
3346 			/* release the reference we hold on the nested pmap */
3347 			pmap_destroy_internal(pmap->nested_pmap);
3348 		}
3349 	}
3350 
3351 	pmap_check_ledgers(pmap);
3352 
3353 	if (pmap->nested_region_asid_bitmap) {
3354 #if XNU_MONITOR
3355 		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
3356 #else
3357 		kfree_data(pmap->nested_region_asid_bitmap,
3358 		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
3359 #endif
3360 	}
3361 
3362 #if XNU_MONITOR
3363 	if (pmap->ledger) {
3364 		pmap_ledger_release(pmap->ledger);
3365 	}
3366 
3367 	pmap_lock_destroy(pmap);
3368 	pmap_free_pmap(pmap);
3369 #else
3370 	pmap_lock_destroy(pmap);
3371 	zfree(pmap_zone, pmap);
3372 #endif
3373 }
3374 
3375 void
3376 pmap_destroy(
3377 	pmap_t pmap)
3378 {
3379 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3380 
3381 	ledger_t ledger = pmap->ledger;
3382 
3383 #if XNU_MONITOR
3384 	pmap_destroy_ppl(pmap);
3385 
3386 	pmap_ledger_check_balance(pmap);
3387 #else
3388 	pmap_destroy_internal(pmap);
3389 #endif
3390 
3391 	ledger_dereference(ledger);
3392 
3393 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3394 }
3395 
3396 
3397 /*
3398  *	Add a reference to the specified pmap.
3399  */
3400 MARK_AS_PMAP_TEXT void
3401 pmap_reference_internal(
3402 	pmap_t pmap)
3403 {
3404 	if (pmap != PMAP_NULL) {
3405 		validate_pmap_mutable(pmap);
3406 		os_atomic_inc(&pmap->ref_count, relaxed);
3407 	}
3408 }
3409 
3410 void
3411 pmap_reference(
3412 	pmap_t pmap)
3413 {
3414 #if XNU_MONITOR
3415 	pmap_reference_ppl(pmap);
3416 #else
3417 	pmap_reference_internal(pmap);
3418 #endif
3419 }
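
/*
 * Reference-counting sketch (illustrative only): a pmap starts life with
 * ref_count == 1; each pmap_reference() adds a reference and each
 * pmap_destroy() drops one.  Only the call that takes ref_count to zero
 * actually tears the address space down:
 *
 *   pmap_reference(pmap);   // share the pmap with an additional holder
 *   ...
 *   pmap_destroy(pmap);     // drops the extra reference; pmap survives
 *   pmap_destroy(pmap);     // final reference: tables freed, ASID released
 */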
3420 
3421 static tt_entry_t *
3422 pmap_tt1_allocate(
3423 	pmap_t          pmap,
3424 	vm_size_t       size,
3425 	unsigned        option)
3426 {
3427 	tt_entry_t      *tt1 = NULL;
3428 	tt_free_entry_t *tt1_free;
3429 	pmap_paddr_t    pa;
3430 	vm_address_t    va;
3431 	vm_address_t    va_end;
3432 	kern_return_t   ret;
3433 
3434 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3435 		size = PAGE_SIZE;
3436 	}
3437 
3438 	pmap_simple_lock(&tt1_lock);
3439 	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3440 		free_page_size_tt_count--;
3441 		tt1 = (tt_entry_t *)free_page_size_tt_list;
3442 		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3443 	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
3444 		free_two_page_size_tt_count--;
3445 		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
3446 		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3447 	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3448 		free_tt_count--;
3449 		tt1 = (tt_entry_t *)free_tt_list;
3450 		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3451 	}
3452 
3453 	pmap_simple_unlock(&tt1_lock);
3454 
3455 	if (tt1 != NULL) {
3456 		pmap_tt_ledger_credit(pmap, size);
3457 		return (tt_entry_t *)tt1;
3458 	}
3459 
3460 	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3461 
3462 	if (ret == KERN_RESOURCE_SHORTAGE) {
3463 		return (tt_entry_t *)0;
3464 	}
3465 
3466 #if XNU_MONITOR
3467 	assert(pa);
3468 #endif
3469 
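	/*
	 * A root table smaller than a kernel page was requested, but a full
	 * page was allocated above.  Carve the remainder of the page into
	 * root-table-sized chunks and splice them onto the global free list so
	 * that later allocations of this size can reuse them without taking
	 * another page.
	 */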
3470 	if (size < PAGE_SIZE) {
3471 		va = phystokv(pa) + size;
3472 		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3473 		tt_free_entry_t *next_free = NULL;
3474 		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3475 			tt1_free = (tt_free_entry_t *)va;
3476 			tt1_free->next = next_free;
3477 			next_free = tt1_free;
3478 		}
3479 		pmap_simple_lock(&tt1_lock);
3480 		local_free_list->next = free_tt_list;
3481 		free_tt_list = next_free;
3482 		free_tt_count += ((PAGE_SIZE / size) - 1);
3483 		if (free_tt_count > free_tt_max) {
3484 			free_tt_max = free_tt_count;
3485 		}
3486 		pmap_simple_unlock(&tt1_lock);
3487 	}
3488 
3489 	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained via the arm_pt_root_size sysctl.
3490 	 * Depending on the device, this can vary between 512 bytes and 16KB. */
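	/*
	 * A minimal userspace sketch for reading that value; the exact MIB
	 * prefix is an assumption here (only the leaf name "arm_pt_root_size"
	 * is given above):
	 *
	 *   uint64_t root_size = 0;
	 *   size_t len = sizeof(root_size);
	 *   if (sysctlbyname("kern.arm_pt_root_size", &root_size, &len, NULL, 0) == 0) {
	 *       printf("PMAP_ROOT_ALLOC_SIZE = %llu bytes\n", root_size);
	 *   }
	 */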
3491 	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3492 	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3493 	pmap_tt_ledger_credit(pmap, size);
3494 
3495 	return (tt_entry_t *) phystokv(pa);
3496 }
3497 
3498 static void
3499 pmap_tt1_deallocate(
3500 	pmap_t pmap,
3501 	tt_entry_t *tt,
3502 	vm_size_t size,
3503 	unsigned option)
3504 {
3505 	tt_free_entry_t *tt_entry;
3506 
3507 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3508 		size = PAGE_SIZE;
3509 	}
3510 
3511 	tt_entry = (tt_free_entry_t *)tt;
3512 	assert(not_in_kdp);
3513 	pmap_simple_lock(&tt1_lock);
3514 
3515 	if (size < PAGE_SIZE) {
3516 		free_tt_count++;
3517 		if (free_tt_count > free_tt_max) {
3518 			free_tt_max = free_tt_count;
3519 		}
3520 		tt_entry->next = free_tt_list;
3521 		free_tt_list = tt_entry;
3522 	}
3523 
3524 	if (size == PAGE_SIZE) {
3525 		free_page_size_tt_count++;
3526 		if (free_page_size_tt_count > free_page_size_tt_max) {
3527 			free_page_size_tt_max = free_page_size_tt_count;
3528 		}
3529 		tt_entry->next = free_page_size_tt_list;
3530 		free_page_size_tt_list = tt_entry;
3531 	}
3532 
3533 	if (size == 2 * PAGE_SIZE) {
3534 		free_two_page_size_tt_count++;
3535 		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
3536 			free_two_page_size_tt_max = free_two_page_size_tt_count;
3537 		}
3538 		tt_entry->next = free_two_page_size_tt_list;
3539 		free_two_page_size_tt_list = tt_entry;
3540 	}
3541 
3542 	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
3543 		pmap_simple_unlock(&tt1_lock);
3544 		pmap_tt_ledger_debit(pmap, size);
3545 		return;
3546 	}
3547 
3548 	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
3549 		free_page_size_tt_count--;
3550 		tt = (tt_entry_t *)free_page_size_tt_list;
3551 		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3552 
3553 		pmap_simple_unlock(&tt1_lock);
3554 
3555 		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
3556 
3557 		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3558 
3559 		pmap_simple_lock(&tt1_lock);
3560 	}
3561 
3562 	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
3563 		free_two_page_size_tt_count--;
3564 		tt = (tt_entry_t *)free_two_page_size_tt_list;
3565 		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3566 
3567 		pmap_simple_unlock(&tt1_lock);
3568 
3569 		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
3570 
3571 		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3572 
3573 		pmap_simple_lock(&tt1_lock);
3574 	}
3575 	pmap_simple_unlock(&tt1_lock);
3576 	pmap_tt_ledger_debit(pmap, size);
3577 }
3578 
3579 MARK_AS_PMAP_TEXT static kern_return_t
3580 pmap_tt_allocate(
3581 	pmap_t pmap,
3582 	tt_entry_t **ttp,
3583 	unsigned int level,
3584 	unsigned int options)
3585 {
3586 	pmap_paddr_t pa;
3587 	*ttp = NULL;
3588 
3589 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3590 	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
3591 		tt_free_entry_t *tt_free_cur, *tt_free_next;
3592 
3593 		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
3594 		tt_free_next = tt_free_cur->next;
3595 		tt_free_cur->next = NULL;
3596 		*ttp = (tt_entry_t *)tt_free_cur;
3597 		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
3598 	}
3599 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3600 
3601 	if (*ttp == NULL) {
3602 		pt_desc_t       *ptdp;
3603 
3604 		/*
3605 		 *  Allocate a VM page for the level x page table entries.
3606 		 */
3607 		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
3608 			if (options & PMAP_OPTIONS_NOWAIT) {
3609 				return KERN_RESOURCE_SHORTAGE;
3610 			}
3611 			VM_PAGE_WAIT();
3612 		}
3613 
3614 		while ((ptdp = ptd_alloc(pmap)) == NULL) {
3615 			if (options & PMAP_OPTIONS_NOWAIT) {
3616 				pmap_pages_free(pa, PAGE_SIZE);
3617 				return KERN_RESOURCE_SHORTAGE;
3618 			}
3619 			VM_PAGE_WAIT();
3620 		}
3621 
3622 		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3623 			OSAddAtomic64(1, &alloc_ttepages_count);
3624 			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3625 		} else {
3626 			OSAddAtomic64(1, &alloc_ptepages_count);
3627 			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3628 		}
3629 
3630 		pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3631 
3632 		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3633 
3634 		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
3635 
3636 		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
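		/*
		 * If this pmap uses a page size smaller than the kernel's (e.g. a
		 * 4K-page pmap hosted on a 16K-page kernel), only the first
		 * pmap_page_size bytes of the new page back this table.  Chop the
		 * remainder into table-sized pieces and park them on the pmap's
		 * private tt_entry_free list for future pmap_tt_allocate() calls.
		 */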
3637 		if (PAGE_SIZE > pmap_page_size) {
3638 			vm_address_t    va;
3639 			vm_address_t    va_end;
3640 
3641 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3642 
3643 			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
3644 				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3645 				pmap->tt_entry_free = (tt_entry_t *)va;
3646 			}
3647 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3648 		}
3649 
3650 		*ttp = (tt_entry_t *)phystokv(pa);
3651 	}
3652 
3653 #if XNU_MONITOR
3654 	assert(*ttp);
3655 #endif
3656 
3657 	return KERN_SUCCESS;
3658 }
3659 
3660 
3661 static void
3662 pmap_tt_deallocate(
3663 	pmap_t pmap,
3664 	tt_entry_t *ttp,
3665 	unsigned int level)
3666 {
3667 	pt_desc_t *ptdp;
3668 	ptd_info_t *ptd_info;
3669 	unsigned pt_acc_cnt;
3670 	unsigned i;
3671 	vm_offset_t     free_page = 0;
3672 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3673 	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);
3674 
3675 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3676 
3677 	ptdp = ptep_get_ptd(ttp);
3678 	ptd_info = ptd_get_info(ptdp, ttp);
3679 
3680 	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;
3681 
3682 	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
3683 		ptd_info->refcnt = 0;
3684 	}
3685 
3686 	if (ptd_info->refcnt != 0) {
3687 		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
3688 	}
3689 
3690 	ptd_info->refcnt = 0;
3691 
3692 	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
3693 		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
3694 	}
3695 
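	/*
	 * pt_acc_cnt is the sum of the refcounts of every table hosted on this
	 * physical page.  If it is zero, check whether every other table-sized
	 * chunk of the page is already sitting on pmap->tt_entry_free: if so,
	 * unlink them all so the whole backing page can be returned below;
	 * otherwise simply push this chunk onto the free list for reuse.
	 */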
3696 	if (pt_acc_cnt == 0) {
3697 		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3698 		unsigned pt_free_entry_cnt = 1;
3699 
3700 		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
3701 			tt_free_entry_t *tt_free_list_next;
3702 
3703 			tt_free_list_next = tt_free_list->next;
3704 			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
3705 				pt_free_entry_cnt++;
3706 			}
3707 			tt_free_list = tt_free_list_next;
3708 		}
3709 		if (pt_free_entry_cnt == max_pt_index) {
3710 			tt_free_entry_t *tt_free_list_cur;
3711 
3712 			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3713 			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
3714 			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;
3715 
3716 			while (tt_free_list_cur) {
3717 				tt_free_entry_t *tt_free_list_next;
3718 
3719 				tt_free_list_next = tt_free_list_cur->next;
3720 				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
3721 					tt_free_list->next = tt_free_list_next->next;
3722 				} else {
3723 					tt_free_list = tt_free_list_next;
3724 				}
3725 				tt_free_list_cur = tt_free_list_next;
3726 			}
3727 		} else {
3728 			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3729 			pmap->tt_entry_free = ttp;
3730 		}
3731 	} else {
3732 		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3733 		pmap->tt_entry_free = ttp;
3734 	}
3735 
3736 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3737 
3738 	if (free_page != 0) {
3739 		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3740 		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
3741 		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
3742 		if (level < pt_attr_leaf_level(pt_attr)) {
3743 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3744 		} else {
3745 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3746 		}
3747 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3748 		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3749 	}
3750 }
3751 
3752 /**
3753  * Safely clear out a translation table entry.
3754  *
3755  * @note If the TTE to clear out points to a leaf table, then that leaf table
3756  *       must have a refcnt of zero before the TTE can be removed.
3757  * @note This function expects to be called with pmap locked exclusive, and will
3758  *       return with pmap unlocked.
3759  *
3760  * @param pmap The pmap containing the page table whose TTE is being removed.
3761  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3762  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3763  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3764  * @param ttep Pointer to the TTE that should be cleared out.
3765  * @param level The level of the page table that contains the TTE to be removed.
3766  */
3767 static void
3768 pmap_tte_remove(
3769 	pmap_t pmap,
3770 	vm_offset_t va_start,
3771 	vm_offset_t va_end,
3772 	bool need_strong_sync,
3773 	tt_entry_t *ttep,
3774 	unsigned int level)
3775 {
3776 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3777 
3778 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3779 	const tt_entry_t tte = *ttep;
3780 
3781 	if (__improbable(tte == ARM_TTE_EMPTY)) {
3782 		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3783 		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3784 	}
3785 
3786 #if (__ARM_VMSA__ == 7)
3787 	{
3788 		tt_entry_t *ttep_4M = (tt_entry_t *) ((vm_offset_t)ttep & 0xFFFFFFF0);
3789 		unsigned i;
3790 
3791 		for (i = 0; i < 4; i++, ttep_4M++) {
3792 			*ttep_4M = (tt_entry_t) 0;
3793 		}
3794 		FLUSH_PTE_STRONG();
3795 	}
3796 #else
3797 	*ttep = (tt_entry_t) 0;
3798 	FLUSH_PTE_STRONG();
3799 #endif /* (__ARM_VMSA__ == 7) */
3800 	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
3801 	if (va_end > va_start) {
3802 #if (__ARM_VMSA__ == 7)
3803 		// Ensure intermediate translations are flushed for each 1MB block
3804 		flush_mmu_tlb_entry_async((va_start & ~ARM_TT_L1_PT_OFFMASK) | (pmap->hw_asid & 0xff));
3805 		flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3806 		flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 2 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3807 		flush_mmu_tlb_entry_async(((va_start & ~ARM_TT_L1_PT_OFFMASK) + 3 * ARM_TT_L1_SIZE) | (pmap->hw_asid & 0xff));
3808 #endif
3809 		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
3810 	}
3811 
3812 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3813 
3814 	/**
3815 	 * Remember, the passed in "level" parameter refers to the level above the
3816 	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
3817 	 * page table).
3818 	 */
3819 	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
3820 
3821 	/**
3822 	 * Non-leaf pagetables don't track active references in the PTD and instead
3823 	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
3824 	 * the real refcount below.
3825 	 */
3826 	unsigned short refcnt = PT_DESC_REFCOUNT;
3827 
3828 	/*
3829 	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
3830 	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
3831 	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
3832 	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
3833 	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
3834 	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
3835 	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
3836 	 * synchronize it against the disconnect operation.  If that removal caused the
3837 	 * refcount to reach zero, the pagetable page could be freed before the disconnect
3838 	 * operation is finished using the relevant pagetable descriptor.
3839 	 * Address these cases by waiting until all CPUs have been observed to not be
3840 	 * executing pmap_disconnect().
3841 	 */
3842 	if (remove_leaf_table) {
3843 		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
3844 		const int max_cpu = ml_get_max_cpu_number();
3845 		bitmap_full(&active_disconnects[0], max_cpu + 1);
3846 		bool inflight_disconnect;
3847 
3848 		/*
3849 		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
3850 		 * ahead of any prior PTE load which may have observed the effect of a
3851 		 * concurrent disconnect operation.  An acquire fence is required for this;
3852 		 * a load-acquire operation is insufficient.
3853 		 */
3854 		os_atomic_thread_fence(acquire);
3855 		do {
3856 			inflight_disconnect = false;
3857 			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
3858 			    i >= 0;
3859 			    i = bitmap_next(&active_disconnects[0], i)) {
3860 				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3861 				if (cpu_data == NULL) {
3862 					continue;
3863 				}
3864 				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
3865 					__builtin_arm_wfe();
3866 					inflight_disconnect = true;
3867 					continue;
3868 				}
3869 				os_atomic_clear_exclusive();
3870 				bitmap_clear(&active_disconnects[0], (unsigned int)i);
3871 			}
3872 		} while (inflight_disconnect);
3873 		/* Ensure the refcount is observed after any observation of inflight_disconnect */
3874 		os_atomic_thread_fence(acquire);
3875 		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
3876 	}
3877 
3878 #if MACH_ASSERT
3879 	/**
3880 	 * On internal devices, always do the page table consistency check
3881 	 * regardless of page table level or the actual refcnt value.
3882 	 */
3883 	{
3884 #else /* MACH_ASSERT */
3885 	/**
3886 	 * Only perform the page table consistency check when deleting leaf page
3887 	 * tables and it seems like there might be valid/compressed mappings
3888 	 * leftover.
3889 	 */
3890 	if (__improbable(remove_leaf_table && refcnt != 0)) {
3891 #endif /* MACH_ASSERT */
3892 
3893 		/**
3894 		 * There are multiple problems that can arise as a non-zero refcnt:
3895 		 * 1. A bug in the refcnt management logic.
3896 		 * 2. A memory stomper or hardware failure.
3897 		 * 3. The VM forgetting to unmap all of the valid mappings in an address
3898 		 *    space before destroying a pmap.
3899 		 *
3900 		 * By looping over the page table and determining how many valid or
3901 		 * compressed entries there actually are, we can narrow down which of
3902 		 * these three cases is causing this panic. If the expected refcnt
3903 		 * (valid + compressed) and the actual refcnt don't match then the
3904 		 * problem is probably either a memory corruption issue (if the
3905 		 * non-empty entries don't match valid+compressed, that could also be a
3906 		 * sign of corruption) or refcnt management bug. Otherwise, there
3907 		 * actually are leftover mappings and the higher layers of xnu are
3908 		 * probably at fault.
3909 		 */
3910 		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
3911 		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));
3912 
3913 		pt_entry_t *ptep = bpte;
3914 		unsigned short non_empty = 0, valid = 0, comp = 0;
3915 		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
3916 			/* Keep track of all non-empty entries to detect memory corruption. */
3917 			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3918 				non_empty++;
3919 			}
3920 
3921 			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
3922 				comp++;
3923 			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
3924 				valid++;
3925 			}
3926 		}
3927 
3928 #if MACH_ASSERT
3929 		/**
3930 		 * On internal machines, panic whenever a page table getting deleted has
3931 		 * leftover mappings (valid or otherwise) or a leaf page table has a
3932 		 * non-zero refcnt.
3933 		 */
3934 		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
3935 #else /* MACH_ASSERT */
3936 		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
3937 		{
3938 #endif /* MACH_ASSERT */
3939 			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3940 			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3941 			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
3942 		}
3943 	}
3944 }
3945 
3946 /**
3947  * Given a pointer to an entry within a `level` page table, delete the
3948  * page table at `level` + 1 that is represented by that entry. For instance,
3949  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3950  * contains the PA of the L3 table, and `level` would be "2".
3951  *
3952  * @note If the table getting deallocated is a leaf table, then that leaf table
3953  *       must have a refcnt of zero before getting deallocated. All other levels
3954  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3955  * @note This function expects to be called with pmap locked exclusive and will
3956  *       return with pmap unlocked.
3957  *
3958  * @param pmap The pmap that owns the page table to be deallocated.
3959  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance
3960  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance
3961  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance
3962  * @param ttep Pointer to the `level` TTE to remove.
3963  * @param level The level of the table that contains an entry pointing to the
3964  *              table to be removed. The deallocated page table will be a
3965  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3966  *              deleted).
3967  */
3968 void
3969 pmap_tte_deallocate(
3970 	pmap_t pmap,
3971 	vm_offset_t va_start,
3972 	vm_offset_t va_end,
3973 	bool need_strong_sync,
3974 	tt_entry_t *ttep,
3975 	unsigned int level)
3976 {
3977 	pmap_paddr_t pa;
3978 	tt_entry_t tte;
3979 
3980 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3981 
3982 	tte = *ttep;
3983 
3984 	if (tte_get_ptd(tte)->pmap != pmap) {
3985 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3986 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3987 	}
3988 
3989 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3990 	    __func__, ttep, (unsigned long long)tte);
3991 	uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
3992 
3993 	/* pmap_tte_remove() will drop the pmap lock */
3994 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3995 
3996 	/* Clear any page offset: we mean to free the whole page, but armv7 TTEs may only be
3997 	 * aligned on 1K boundaries.  We clear the surrounding "chunk" of 4 TTEs above. */
3998 	pa = tte_to_pa(tte) & ~(pmap_page_size - 1);
3999 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(pa), level + 1);
4000 }
4001 
4002 /*
4003  *	Remove a range of hardware page-table entries.
4004  *	The entries given are the first (inclusive)
4005  *	and last (exclusive) entries for the VM pages.
4006  *	The virtual address is the va for the first pte.
4007  *
4008  *	The pmap must be locked.
4009  *	If the pmap is not the kernel pmap, the range must lie
4010  *	entirely within one pte-page.  This is NOT checked.
4011  *	Assumes that the pte-page exists.
4012  *
4013  *	Returns the number of PTE changed
4014  */
4015 MARK_AS_PMAP_TEXT static int
4016 pmap_remove_range(
4017 	pmap_t pmap,
4018 	vm_map_address_t va,
4019 	pt_entry_t *bpte,
4020 	pt_entry_t *epte)
4021 {
4022 	bool need_strong_sync = false;
4023 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4024 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4025 	if (num_changed > 0) {
4026 		PMAP_UPDATE_TLBS(pmap, va,
4027 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4028 	}
4029 	return num_changed;
4030 }
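
/*
 * Illustrative caller sketch for the contract described above: to remove all
 * mappings for [start, end) that live within a single leaf page table, a
 * caller would locate the table through the twig TTE and derive bpte/epte
 * from the leaf index, much as pmap_remove_options_internal() does below:
 *
 *   pt_entry_t *pte_p = (pt_entry_t *) ttetokv(*pmap_tte(pmap, start));
 *   pt_entry_t *bpte  = &pte_p[pte_index(pt_attr, start)];
 *   pt_entry_t *epte  = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
 *   pmap_remove_range(pmap, start, bpte, epte);
 */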
4031 
4032 
4033 #ifdef PVH_FLAG_EXEC
4034 
4035 /*
4036  *	Update the access protection bits of the physical aperture mapping for a page.
4037  *	This is useful, for example, in guaranteeing that a verified executable page
4038  *	has no writable mappings anywhere in the system, including the physical
4039  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4040  *	synchronization overhead in cases where the call to this function is
4041  *	guaranteed to be followed by other TLB operations.
4042  */
4043 void
4044 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4045 {
4046 #if __ARM_PTE_PHYSMAP__
4047 	pvh_assert_locked(pai);
4048 	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4049 	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4050 
4051 	pt_entry_t tmplate = *pte_p;
4052 	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4053 		return;
4054 	}
4055 	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4056 #if (__ARM_VMSA__ > 7)
4057 	if (tmplate & ARM_PTE_HINT_MASK) {
4058 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4059 		    __func__, pte_p, (void *)kva, tmplate);
4060 	}
4061 #endif
4062 	write_pte_strong(pte_p, tmplate);
4063 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
4064 	if (!flush_tlb_async) {
4065 		sync_tlb_flush();
4066 	}
4067 #endif
4068 }
4069 
4070 #endif /* defined(PVH_FLAG_EXEC) */
4071 
4072 MARK_AS_PMAP_TEXT int
4073 pmap_remove_range_options(
4074 	pmap_t pmap,
4075 	vm_map_address_t va,
4076 	pt_entry_t *bpte,
4077 	pt_entry_t *epte,
4078 	vm_map_address_t *eva,
4079 	bool *need_strong_sync __unused,
4080 	int options)
4081 {
4082 	pt_entry_t     *cpte;
4083 	size_t          npages = 0;
4084 	int             num_removed, num_unwired;
4085 	int             num_pte_changed;
4086 	unsigned int    pai = 0;
4087 	pmap_paddr_t    pa;
4088 	int             num_external, num_internal, num_reusable;
4089 	int             num_alt_internal;
4090 	uint64_t        num_compressed, num_alt_compressed;
4091 	int16_t         refcnt = 0;
4092 
4093 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4094 
4095 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4096 	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4097 
4098 	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4099 		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4100 	}
4101 
4102 	num_removed = 0;
4103 	num_unwired = 0;
4104 	num_pte_changed = 0;
4105 	num_external = 0;
4106 	num_internal = 0;
4107 	num_reusable = 0;
4108 	num_compressed = 0;
4109 	num_alt_internal = 0;
4110 	num_alt_compressed = 0;
4111 
4112 #if XNU_MONITOR
4113 	bool ro_va = false;
4114 	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4115 		ro_va = true;
4116 	}
4117 #endif
4118 	for (cpte = bpte; cpte < epte;
4119 	    cpte += PAGE_RATIO, va += pmap_page_size) {
4120 		pt_entry_t      spte;
4121 		boolean_t       managed = FALSE;
4122 
4123 		/*
4124 		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4125 		 * so we need to be as aggressive as possible in checking for preemption when we can.
4126 		 */
4127 		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4128 			*eva = va;
4129 			break;
4130 		}
4131 
4132 		spte = *((volatile pt_entry_t*)cpte);
4133 
4134 		while (!managed) {
4135 			if (pmap != kernel_pmap &&
4136 			    (options & PMAP_OPTIONS_REMOVE) &&
4137 			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4138 				/*
4139 				 * "pmap" must be locked at this point,
4140 				 * so this should not race with another
4141 				 * pmap_remove_range() or pmap_enter().
4142 				 */
4143 
4144 				/* one less "compressed"... */
4145 				num_compressed++;
4146 				if (spte & ARM_PTE_COMPRESSED_ALT) {
4147 					/* ... but it used to be "ALTACCT" */
4148 					num_alt_compressed++;
4149 				}
4150 
4151 				/* clear marker */
4152 				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4153 				/*
4154 				 * "refcnt" also accounts for
4155 				 * our "compressed" markers,
4156 				 * so let's update it here.
4157 				 */
4158 				--refcnt;
4159 				spte = *((volatile pt_entry_t*)cpte);
4160 			}
4161 			/*
4162 			 * It may be possible for the pte to transition from managed
4163 			 * to unmanaged in this timeframe; for now, elide the assert.
4164 			 * We should break out as a consequence of checking pa_valid.
4165 			 */
4166 			//assert(!ARM_PTE_IS_COMPRESSED(spte));
4167 			pa = pte_to_pa(spte);
4168 			if (!pa_valid(pa)) {
4169 #if XNU_MONITOR
4170 				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4171 #endif
4172 #if XNU_MONITOR
4173 				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4174 				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4175 					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4176 					    __func__, (uint64_t)pa);
4177 				}
4178 #endif
4179 				break;
4180 			}
4181 			pai = pa_index(pa);
4182 			pvh_lock(pai);
4183 			spte = *((volatile pt_entry_t*)cpte);
4184 			pa = pte_to_pa(spte);
4185 			if (pai == pa_index(pa)) {
4186 				managed = TRUE;
4187 				break; // Leave pai locked as we will unlock it after we free the PV entry
4188 			}
4189 			pvh_unlock(pai);
4190 		}
4191 
4192 		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4193 			/*
4194 			 * There used to be a valid mapping here but it
4195 			 * has already been removed when the page was
4196 			 * sent to the VM compressor, so nothing left to
4197 			 * remove now...
4198 			 */
4199 			continue;
4200 		}
4201 
4202 		/* remove the translation, do not flush the TLB */
4203 		if (*cpte != ARM_PTE_TYPE_FAULT) {
4204 			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4205 			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4206 #if MACH_ASSERT
4207 			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4208 				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4209 				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4210 			}
4211 #endif
4212 			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4213 			num_pte_changed++;
4214 		}
4215 
4216 		if ((spte != ARM_PTE_TYPE_FAULT) &&
4217 		    (pmap != kernel_pmap)) {
4218 			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4219 			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4220 			--refcnt;
4221 		}
4222 
4223 		if (pte_is_wired(spte)) {
4224 			pte_set_wired(pmap, cpte, 0);
4225 			num_unwired++;
4226 		}
4227 		/*
4228 		 * if not managed, we're done
4229 		 */
4230 		if (!managed) {
4231 			continue;
4232 		}
4233 
4234 #if XNU_MONITOR
4235 		if (__improbable(ro_va)) {
4236 			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4237 		}
4238 #endif
4239 
4240 		/*
4241 		 * find and remove the mapping from the chain for this
4242 		 * physical address.
4243 		 */
4244 		bool is_internal, is_altacct;
4245 		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4246 
4247 		if (is_altacct) {
4248 			assert(is_internal);
4249 			num_internal++;
4250 			num_alt_internal++;
4251 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4252 				ppattr_clear_altacct(pai);
4253 				ppattr_clear_internal(pai);
4254 			}
4255 		} else if (is_internal) {
4256 			if (ppattr_test_reusable(pai)) {
4257 				num_reusable++;
4258 			} else {
4259 				num_internal++;
4260 			}
4261 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4262 				ppattr_clear_internal(pai);
4263 			}
4264 		} else {
4265 			num_external++;
4266 		}
4267 		pvh_unlock(pai);
4268 		num_removed++;
4269 	}
4270 
4271 	/*
4272 	 *	Update the counts
4273 	 */
4274 	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4275 
4276 	if (pmap != kernel_pmap) {
4277 		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4278 			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4279 		}
4280 
4281 		/* update ledgers */
4282 		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4283 		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4284 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4285 		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4286 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4287 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4288 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4289 		/* make needed adjustments to phys_footprint */
4290 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4291 		    ((num_internal -
4292 		    num_alt_internal) +
4293 		    (num_compressed -
4294 		    num_alt_compressed)) * pmap_page_size);
4295 	}
4296 
4297 	/* flush the ptable entries we have written */
4298 	if (num_pte_changed > 0) {
4299 		FLUSH_PTE_STRONG();
4300 	}
4301 
4302 	return num_pte_changed;
4303 }
4304 
4305 
4306 /*
4307  *	Remove the given range of addresses
4308  *	from the specified map.
4309  *
4310  *	It is assumed that the start and end are properly
4311  *	rounded to the hardware page size.
4312  */
4313 void
4314 pmap_remove(
4315 	pmap_t pmap,
4316 	vm_map_address_t start,
4317 	vm_map_address_t end)
4318 {
4319 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4320 }
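
/*
 * Example usage (illustrative only): unmap a single VM page, with both
 * bounds rounded to the hardware page size as the contract above requires:
 *
 *   vm_map_address_t start = vm_map_trunc_page(addr, PAGE_MASK);
 *   pmap_remove(map->pmap, start, start + PAGE_SIZE);
 */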
4321 
4322 MARK_AS_PMAP_TEXT vm_map_address_t
4323 pmap_remove_options_internal(
4324 	pmap_t pmap,
4325 	vm_map_address_t start,
4326 	vm_map_address_t end,
4327 	int options)
4328 {
4329 	vm_map_address_t eva = end;
4330 	pt_entry_t     *bpte, *epte;
4331 	pt_entry_t     *pte_p;
4332 	tt_entry_t     *tte_p;
4333 	int             remove_count = 0;
4334 	bool            need_strong_sync = false;
4335 	bool            unlock = true;
4336 
4337 	if (__improbable(end < start)) {
4338 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4339 	}
4340 	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4341 		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4342 	}
4343 
4344 	validate_pmap_mutable(pmap);
4345 
4346 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4347 
4348 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4349 
4350 	tte_p = pmap_tte(pmap, start);
4351 
4352 	if (tte_p == (tt_entry_t *) NULL) {
4353 		goto done;
4354 	}
4355 
4356 	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
4357 		pte_p = (pt_entry_t *) ttetokv(*tte_p);
4358 		bpte = &pte_p[pte_index(pt_attr, start)];
4359 		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
4360 
4361 		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
4362 		    &need_strong_sync, options);
4363 
4364 		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
4365 			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
4366 			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
4367 			unlock = false; // pmap_tte_deallocate() has dropped the lock
4368 		}
4369 	}
4370 
4371 done:
4372 	if (unlock) {
4373 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
4374 	}
4375 
4376 	if (remove_count > 0) {
4377 		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
4378 	}
4379 	return eva;
4380 }
4381 
4382 void
4383 pmap_remove_options(
4384 	pmap_t pmap,
4385 	vm_map_address_t start,
4386 	vm_map_address_t end,
4387 	int options)
4388 {
4389 	vm_map_address_t va;
4390 
4391 	if (pmap == PMAP_NULL) {
4392 		return;
4393 	}
4394 
4395 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4396 
4397 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4398 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4399 	    VM_KERNEL_ADDRHIDE(end));
4400 
4401 #if MACH_ASSERT
4402 	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4403 		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4404 		    pmap, (uint64_t)start, (uint64_t)end);
4405 	}
4406 	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4407 		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4408 		    pmap, (uint64_t)start, (uint64_t)end);
4409 	}
4410 #endif
4411 
4412 	/*
4413 	 * We allow single-page requests to execute non-preemptibly,
4414 	 * as it doesn't make sense to sample AST_URGENT for a single-page
4415 	 * operation, and there are a couple of special use cases that
4416 	 * require a non-preemptible single-page operation.
4417 	 */
4418 	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4419 		pmap_verify_preemptible();
4420 	}
4421 
4422 	/*
4423 	 *      Invalidate the translation buffer first
4424 	 */
4425 	va = start;
4426 	while (va < end) {
4427 		vm_map_address_t l;
4428 
4429 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4430 		if (l > end) {
4431 			l = end;
4432 		}
4433 
4434 #if XNU_MONITOR
4435 		va = pmap_remove_options_ppl(pmap, va, l, options);
4436 
4437 		pmap_ledger_check_balance(pmap);
4438 #else
4439 		va = pmap_remove_options_internal(pmap, va, l, options);
4440 #endif
4441 	}
4442 
4443 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4444 }
4445 
4446 
4447 /*
4448  *	Remove phys addr if mapped in specified map
4449  */
4450 void
4451 pmap_remove_some_phys(
4452 	__unused pmap_t map,
4453 	__unused ppnum_t pn)
4454 {
4455 	/* Implement to support working set code */
4456 }
4457 
4458 /*
4459  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4460  * switch a thread onto a new vm_map.
4461  */
4462 void
4463 pmap_switch_user(thread_t thread, vm_map_t new_map)
4464 {
4465 	pmap_t new_pmap = new_map->pmap;
4466 
4467 
4468 	thread->map = new_map;
4469 	pmap_set_pmap(new_pmap, thread);
4470 
4471 }
4472 
4473 void
4474 pmap_set_pmap(
4475 	pmap_t pmap,
4476 #if     !__ARM_USER_PROTECT__
4477 	__unused
4478 #endif
4479 	thread_t        thread)
4480 {
4481 	pmap_switch(pmap);
4482 #if __ARM_USER_PROTECT__
4483 	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
4484 	thread->machine.asid = pmap->hw_asid;
4485 #endif
4486 }
4487 
4488 static void
4489 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4490 {
4491 #if (__ARM_VMSA__ == 7)
4492 	flush_core_tlb_asid_async(pmap->hw_asid);
4493 #else
4494 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4495 #endif
4496 }
4497 
4498 static inline bool
4499 pmap_user_ttb_is_clear(void)
4500 {
4501 #if (__ARM_VMSA__ > 7)
4502 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4503 #else
4504 	return get_mmu_ttb() == kernel_pmap->ttep;
4505 #endif
4506 }
4507 
4508 MARK_AS_PMAP_TEXT void
4509 pmap_switch_internal(
4510 	pmap_t pmap)
4511 {
4512 	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4513 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4514 #if XNU_MONITOR
4515 	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4516 #endif
4517 	validate_pmap_mutable(pmap);
4518 	uint16_t asid_index = pmap->hw_asid;
4519 	bool do_asid_flush = false;
4520 	bool do_commpage_flush = false;
4521 
4522 	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4523 		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4524 	}
4525 #if __ARM_KERNEL_PROTECT__
4526 	asid_index >>= 1;
4527 #endif
4528 
4529 	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4530 #if (__ARM_VMSA__ > 7)
4531 	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4532 	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4533 	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4534 #endif
4535 	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4536 	bool break_before_make = do_shared_region_flush;
4537 
4538 	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4539 		asid_index -= 1;
4540 		pmap_update_plru(asid_index);
4541 
4542 		/* Paranoia. */
4543 		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4544 
4545 		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4546 		uint8_t new_sw_asid = pmap->sw_asid;
4547 		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4548 
4549 		if (new_sw_asid != last_sw_asid) {
4550 			/*
4551 			 * If the virtual ASID of the new pmap does not match the virtual ASID
4552 			 * last seen on this CPU for the physical ASID (that was a mouthful),
4553 			 * then this switch runs the risk of aliasing.  We need to flush the
4554 			 * TLB for this phyiscal ASID in this case.
4555 			 */
4556 			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4557 			do_asid_flush = true;
4558 			break_before_make = true;
4559 		}
4560 	}
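
	/*
	 * Illustrative example of the ASID virtualization above: if the
	 * hardware supports MAX_HW_ASIDS == 256 but pmap_max_asids == 1024,
	 * four pmaps may end up sharing each physical ASID, distinguished only
	 * by their sw_asid.  If this CPU last ran a pmap with
	 * (hw_asid == 5, sw_asid == 2) and is now switching to one with
	 * (hw_asid == 5, sw_asid == 3), any stale TLB entries tagged with
	 * ASID 5 must be flushed before the new pmap runs, hence
	 * do_asid_flush above.
	 */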
4561 
4562 #if __ARM_MIXED_PAGE_SIZE__
4563 	if (pt_attr->pta_tcr_value != get_tcr()) {
4564 		break_before_make = true;
4565 	}
4566 #endif
4567 #if __ARM_MIXED_PAGE_SIZE__
4568 	/*
4569 	 * For mixed page size configurations, we need to flush the global commpage mappings from
4570 	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
4571 	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
4572 	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4573 	 * conflict abort or other unpredictable behavior.
4574 	 */
4575 	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4576 		do_commpage_flush = true;
4577 	}
4578 	if (do_commpage_flush) {
4579 		break_before_make = true;
4580 	}
4581 #endif
4582 	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4583 		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4584 		pmap_clear_user_ttb_internal();
4585 	}
4586 
4587 	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
4588 	 * to flush the userspace mappings for that region.  Those mappings are global
4589 	 * and will not be protected by the ASID.  It should also be cheaper to flush the
4590 	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4591 	if (__improbable(do_shared_region_flush)) {
4592 #if __ARM_RANGE_TLBI__
4593 		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4594 		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4595 
4596 		/* NOTE: here we flush the global TLB entries for the previous nested region only.
4597 		 * There may still be non-global entries that overlap with the incoming pmap's
4598 		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
4599 		 * must necessarily belong to a different ASID than the incoming pmap, or they would
4600 		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
4601 		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
4602 		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4603 		 * to consider additional invalidation here in the future. */
4604 		if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
4605 			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4606 		} else {
4607 			do_asid_flush = false;
4608 			flush_core_tlb_async();
4609 		}
4610 #else
4611 		do_asid_flush = false;
4612 		flush_core_tlb_async();
4613 #endif // __ARM_RANGE_TLBI__
4614 	}
4615 
4616 #if __ARM_MIXED_PAGE_SIZE__
4617 	if (__improbable(do_commpage_flush)) {
4618 		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4619 		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4620 		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
4621 		flush_core_tlb_allrange_async(rtlbi_param);
4622 	}
4623 #endif
4624 	if (__improbable(do_asid_flush)) {
4625 		pmap_flush_core_tlb_asid_async(pmap);
4626 #if DEVELOPMENT || DEBUG
4627 		os_atomic_inc(&pmap_asid_flushes, relaxed);
4628 #endif
4629 	}
4630 	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4631 		sync_tlb_flush_local();
4632 	}
4633 
4634 	pmap_switch_user_ttb(pmap, cpu_data_ptr);
4635 }
4636 
4637 void
4638 pmap_switch(
4639 	pmap_t pmap)
4640 {
4641 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4642 #if XNU_MONITOR
4643 	pmap_switch_ppl(pmap);
4644 #else
4645 	pmap_switch_internal(pmap);
4646 #endif
4647 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4648 }
4649 
4650 void
4651 pmap_page_protect(
4652 	ppnum_t ppnum,
4653 	vm_prot_t prot)
4654 {
4655 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4656 }
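
/*
 * Example (illustrative): revoke write permission from every existing mapping
 * of physical page `ppnum` while leaving read/execute mappings in place:
 *
 *   pmap_page_protect(ppnum, VM_PROT_READ | VM_PROT_EXECUTE);
 *
 * Passing VM_PROT_NONE instead removes the mappings entirely, as the switch
 * statement in pmap_page_protect_options_with_flush_range() below spells out.
 */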
4657 
4658 /*
4659  *	Routine:	pmap_page_protect_options
4660  *
4661  *	Function:
4662  *		Lower the permission for all mappings to a given
4663  *		page.
4664  */
4665 MARK_AS_PMAP_TEXT static void
4666 pmap_page_protect_options_with_flush_range(
4667 	ppnum_t ppnum,
4668 	vm_prot_t prot,
4669 	unsigned int options,
4670 	pmap_tlb_flush_range_t *flush_range)
4671 {
4672 	pmap_paddr_t    phys = ptoa(ppnum);
4673 	pv_entry_t    **pv_h;
4674 	pv_entry_t     *pve_p, *orig_pve_p;
4675 	pv_entry_t     *pveh_p;
4676 	pv_entry_t     *pvet_p;
4677 	pt_entry_t     *pte_p, *orig_pte_p;
4678 	pv_entry_t     *new_pve_p;
4679 	pt_entry_t     *new_pte_p;
4680 	vm_offset_t     pvh_flags;
4681 	unsigned int    pai;
4682 	bool            remove;
4683 	bool            set_NX;
4684 	unsigned int    pvh_cnt = 0;
4685 	unsigned int    pass1_updated = 0;
4686 	unsigned int    pass2_updated = 0;
4687 
4688 	assert(ppnum != vm_page_fictitious_addr);
4689 
4690 	/* Only work with managed pages. */
4691 	if (!pa_valid(phys)) {
4692 		return;
4693 	}
4694 
4695 	/*
4696 	 * Determine the new protection.
4697 	 */
4698 	switch (prot) {
4699 	case VM_PROT_ALL:
4700 		return;         /* nothing to do */
4701 	case VM_PROT_READ:
4702 	case VM_PROT_READ | VM_PROT_EXECUTE:
4703 		remove = false;
4704 		break;
4705 	default:
4706 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4707 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4708 		remove = true;
4709 		break;
4710 	}
4711 
4712 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4713 	if (remove) {
4714 #if !XNU_MONITOR
4715 		mp_disable_preemption();
4716 #endif
4717 		pmap_cpu_data = pmap_get_cpu_data();
4718 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4719 		/*
4720 		 * Ensure the store to inflight_disconnect will be observed before any of the
4721 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4722 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4723 		 * another CPU, in between this function's clearing a PTE and dropping the
4724 		 * corresponding pagetable refcount.  That can lead to a panic if the
4725 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4726 		 * store barrier; a store-release operation would not be sufficient.
4727 		 */
4728 		os_atomic_thread_fence(release);
4729 	}
4730 
4731 	pai = pa_index(phys);
4732 	pvh_lock(pai);
4733 	pv_h = pai_to_pvh(pai);
4734 	pvh_flags = pvh_get_flags(pv_h);
4735 
4736 #if XNU_MONITOR
4737 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4738 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4739 	}
4740 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4741 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4742 	}
4743 #endif
4744 
4745 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4746 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4747 	pveh_p = PV_ENTRY_NULL;
4748 	pvet_p = PV_ENTRY_NULL;
4749 	new_pve_p = PV_ENTRY_NULL;
4750 	new_pte_p = PT_ENTRY_NULL;
4751 
4752 
4753 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4754 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4755 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4756 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4757 		pveh_p = pve_p;
4758 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4759 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4760 	}
4761 
4762 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4763 	int pve_ptep_idx = 0;
4764 
4765 	/*
4766 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4767 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4768 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4769 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4770 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4771 	 * tlb_flush_needed to be true while issue_tlbi is false.
4772 	 */
4773 	bool issue_tlbi = false;
4774 	bool tlb_flush_needed = false;
4775 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4776 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4777 		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4778 		bool update = false;
4779 
4780 		if (pve_p != PV_ENTRY_NULL) {
4781 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4782 			if (pte_p == PT_ENTRY_NULL) {
4783 				goto protect_skip_pve_pass1;
4784 			}
4785 		}
4786 
4787 #ifdef PVH_FLAG_IOMMU
4788 		if (pvh_ptep_is_iommu(pte_p)) {
4789 #if XNU_MONITOR
4790 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4791 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4792 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4793 			}
4794 #endif
4795 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4796 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4797 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4798 			}
4799 			goto protect_skip_pve_pass1;
4800 		}
4801 #endif
4802 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4803 		const pmap_t pmap = ptdp->pmap;
4804 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4805 
4806 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4807 #if MACH_ASSERT
4808 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4809 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as a duplicate. */
4810 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4811 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4812 
4813 				pv_entry_t *check_pvep = pve_p;
4814 
4815 				do {
4816 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4817 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4818 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4819 					}
4820 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4821 
4822 				/* Restore previous PTEP value. */
4823 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4824 			}
4825 #endif
4826 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4827 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4828 		}
4829 
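		/*
		 * Decide whether the updated mapping must be made non-executable: XN is
		 * forced unless the caller requested execute permission (or, on
		 * DEVELOPMENT || DEBUG kernels, NX enforcement is disabled globally or
		 * for this pmap).
		 */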
4830 #if DEVELOPMENT || DEBUG
4831 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4832 #else
4833 		if ((prot & VM_PROT_EXECUTE))
4834 #endif
4835 		{
4836 			set_NX = false;
4837 		} else {
4838 			set_NX = true;
4839 		}
4840 
4841 		/* Remove the mapping if new protection is NONE */
4842 		if (remove) {
4843 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4844 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4845 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4846 			pt_entry_t spte = *pte_p;
4847 
4848 			if (pte_is_wired(spte)) {
4849 				pte_set_wired(pmap, pte_p, 0);
4850 				spte = *pte_p;
4851 				if (pmap != kernel_pmap) {
4852 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4853 				}
4854 			}
4855 
4856 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4857 			    (uint64_t)spte, pte_p, ppnum);
4858 
4859 			if (compress && is_internal && (pmap != kernel_pmap)) {
4860 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4861 				/* mark this PTE as having been "compressed" */
4862 				tmplate = ARM_PTE_COMPRESSED;
4863 				if (is_altacct) {
4864 					tmplate |= ARM_PTE_COMPRESSED_ALT;
4865 				}
4866 			} else {
4867 				tmplate = ARM_PTE_TYPE_FAULT;
4868 			}
4869 
4870 			assert(spte != tmplate);
4871 			write_pte_fast(pte_p, tmplate);
4872 			update = true;
4873 			++pass1_updated;
4874 
4875 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4876 
4877 			if (pmap != kernel_pmap) {
4878 				if (ppattr_test_reusable(pai) &&
4879 				    is_internal &&
4880 				    !is_altacct) {
4881 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4882 				} else if (!is_internal) {
4883 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4884 				}
4885 
4886 				if (is_altacct) {
4887 					assert(is_internal);
4888 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4889 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4890 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4891 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4892 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4893 					}
4894 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4895 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4896 				} else if (ppattr_test_reusable(pai)) {
4897 					assert(is_internal);
4898 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4899 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4900 						/* was not in footprint, but is now */
4901 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4902 					}
4903 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4904 				} else if (is_internal) {
4905 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4906 
4907 					/*
4908 					 * Update all stats related to physical footprint, which only
4909 					 * deals with internal pages.
4910 					 */
4911 					if (options & PMAP_OPTIONS_COMPRESSOR) {
4912 						/*
4913 						 * This removal is only being done so we can send this page to
4914 						 * the compressor; therefore it mustn't affect total task footprint.
4915 						 */
4916 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4917 					} else {
4918 						/*
4919 						 * This internal page isn't going to the compressor, so adjust stats to keep
4920 						 * phys_footprint up to date.
4921 						 */
4922 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4923 					}
4924 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4925 				} else {
4926 					/* external page: no impact on ledgers */
4927 				}
4928 			}
4929 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4930 		} else {
4931 			pt_entry_t spte = *pte_p;
4932 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4933 
4934 			if (pmap == kernel_pmap) {
4935 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4936 			} else {
4937 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4938 			}
4939 
4940 			/*
4941 			 * While the naive implementation of this would serve to add execute
4942 			 * permission, this is not how the VM uses this interface, or how
4943 			 * x86_64 implements it.  So ignore requests to add execute permissions.
4944 			 */
4945 			if (set_NX) {
4946 				tmplate |= pt_attr_leaf_xn(pt_attr);
4947 			}
4948 
4949 
4950 			assert(spte != ARM_PTE_TYPE_FAULT);
4951 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4952 
4953 			if (spte != tmplate) {
4954 				/*
4955 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4956 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4957 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
4958 				 * should always be cleared by this function.
4959 				 */
4960 				pte_set_was_writeable(tmplate, true);
4961 				write_pte_fast(pte_p, tmplate);
4962 				update = true;
4963 				++pass1_updated;
4964 			} else if (pte_was_writeable(tmplate)) {
4965 				/*
4966 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
4967 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
4968 				 * write access to a page, this function should always at least clear that flag for
4969 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
4970 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
4971 				 * be handled through arm_fast_fault().
4972 				 */
4973 				pte_set_was_writeable(tmplate, false);
4974 				write_pte_fast(pte_p, tmplate);
4975 			}
4976 		}
4977 
4978 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
4979 			tlb_flush_needed = true;
4980 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
4981 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4982 				issue_tlbi = true;
4983 			}
4984 		}
4985 protect_skip_pve_pass1:
4986 		pte_p = PT_ENTRY_NULL;
4987 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4988 			pve_ptep_idx = 0;
4989 			pve_p = pve_next(pve_p);
4990 		}
4991 	}
4992 
4993 	if (tlb_flush_needed) {
4994 		FLUSH_PTE_STRONG();
4995 	}
4996 
4997 	if (!remove && !issue_tlbi) {
4998 		goto protect_finish;
4999 	}
5000 
5001 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5002 	pv_entry_t **pve_pp = pv_h;
5003 	pve_p = orig_pve_p;
5004 	pte_p = orig_pte_p;
5005 	pve_ptep_idx = 0;
5006 
5007 	/*
5008 	 * We need to keep track of whether a particular PVE list contains IOMMU
5009 	 * mappings when removing entries, because we should only remove CPU
5010 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5011 	 * it around.
5012 	 */
5013 	bool iommu_mapping_in_pve = false;
5014 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5015 		if (pve_p != PV_ENTRY_NULL) {
5016 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5017 			if (pte_p == PT_ENTRY_NULL) {
5018 				goto protect_skip_pve_pass2;
5019 			}
5020 		}
5021 
5022 #ifdef PVH_FLAG_IOMMU
5023 		if (pvh_ptep_is_iommu(pte_p)) {
5024 			iommu_mapping_in_pve = true;
5025 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5026 				/*
5027 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5028 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5029 				 * contain the single IOMMU PTE and exit the loop.
5030 				 */
5031 				new_pte_p = pte_p;
5032 				break;
5033 			}
5034 			goto protect_skip_pve_pass2;
5035 		}
5036 #endif
5037 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5038 		const pmap_t pmap = ptdp->pmap;
5039 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5040 
5041 		if (remove) {
5042 			if (!compress && (pmap != kernel_pmap)) {
5043 				/*
5044 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5045 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5046 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5047 				 * under us.
5048 				 */
5049 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5050 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5051 				}
5052 			}
5053 			/* Remove this CPU mapping from PVE list. */
5054 			if (pve_p != PV_ENTRY_NULL) {
5055 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5056 			}
5057 		} else {
5058 			pt_entry_t spte = *pte_p;
5059 			if (pte_was_writeable(spte)) {
5060 				pte_set_was_writeable(spte, false);
5061 				write_pte_fast(pte_p, spte);
5062 			} else {
5063 				goto protect_skip_pve_pass2;
5064 			}
5065 		}
5066 		++pass2_updated;
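		/*
		 * Invalidate the TLB for this mapping unless this is a permission
		 * downgrade already covered by the caller-provided flush_range, in which
		 * case the caller performs the flush.
		 */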
5067 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5068 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5069 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5070 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
5071 		}
5072 
5073 protect_skip_pve_pass2:
5074 		pte_p = PT_ENTRY_NULL;
5075 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5076 			pve_ptep_idx = 0;
5077 
5078 			if (remove) {
5079 				/**
5080 				 * If there are any IOMMU mappings in the PVE list, preserve
5081 				 * those mappings in a new PVE list (new_pve_p) which will later
5082 				 * become the new PVH entry. Keep track of the CPU mappings in
5083 				 * pveh_p/pvet_p so they can be deallocated later.
5084 				 */
5085 				if (iommu_mapping_in_pve) {
5086 					iommu_mapping_in_pve = false;
5087 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5088 					pve_remove(pv_h, pve_pp, pve_p);
5089 					pveh_p = pvh_pve_list(pv_h);
5090 					pve_p->pve_next = new_pve_p;
5091 					new_pve_p = pve_p;
5092 					pve_p = temp_pve_p;
5093 					continue;
5094 				} else {
5095 					pvet_p = pve_p;
5096 					pvh_cnt++;
5097 				}
5098 			}
5099 
5100 			pve_pp = pve_next_ptr(pve_p);
5101 			pve_p = pve_next(pve_p);
5102 			iommu_mapping_in_pve = false;
5103 		}
5104 	}
5105 
5106 protect_finish:
5107 
5108 #ifdef PVH_FLAG_EXEC
5109 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5110 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5111 	}
5112 #endif
5113 	if (__improbable(pass1_updated != pass2_updated)) {
5114 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5115 		    __func__, pass1_updated, pass2_updated);
5116 	}
5117 	/* if we removed a bunch of entries, take care of them now */
5118 	if (remove) {
5119 		if (new_pve_p != PV_ENTRY_NULL) {
5120 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5121 			pvh_set_flags(pv_h, pvh_flags);
5122 		} else if (new_pte_p != PT_ENTRY_NULL) {
5123 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5124 			pvh_set_flags(pv_h, pvh_flags);
5125 		} else {
5126 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5127 		}
5128 	}
5129 
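	/*
	 * For a pure permission downgrade, a caller-provided flush_range takes over
	 * responsibility for the TLB flush.  Removals are never deferred: the flush
	 * must happen below, before the PVH lock is dropped.
	 */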
5130 	if (flush_range && tlb_flush_needed) {
5131 		if (!remove) {
5132 			flush_range->ptfr_flush_needed = true;
5133 			tlb_flush_needed = false;
5134 		}
5135 	}
5136 
5137 	/*
5138 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5139 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5140 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5141 	 * a page to be repurposed while it is still live in the TLBs.
5142 	 */
5143 	if (remove && tlb_flush_needed) {
5144 		sync_tlb_flush();
5145 	}
5146 
5147 	pvh_unlock(pai);
5148 
5149 	if (remove) {
5150 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5151 #if !XNU_MONITOR
5152 		mp_enable_preemption();
5153 #endif
5154 	}
5155 
5156 	if (!remove && tlb_flush_needed) {
5157 		sync_tlb_flush();
5158 	}
5159 
5160 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5161 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5162 	}
5163 }
5164 
5165 MARK_AS_PMAP_TEXT void
5166 pmap_page_protect_options_internal(
5167 	ppnum_t ppnum,
5168 	vm_prot_t prot,
5169 	unsigned int options,
5170 	void *arg)
5171 {
5172 	if (arg != NULL) {
5173 		/*
5174 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5175 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5176 		 * TLBs much more precisely if we do so inline with the pagetable updates, and the PPL security
5177 		 * model requires that we not exit the PPL without performing the required TLB flushes anyway.
5178 		 * In that case, force the flush to take place.
5179 		 */
5180 		options &= ~PMAP_OPTIONS_NOFLUSH;
5181 	}
5182 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5183 }
5184 
5185 void
5186 pmap_page_protect_options(
5187 	ppnum_t ppnum,
5188 	vm_prot_t prot,
5189 	unsigned int options,
5190 	void *arg)
5191 {
5192 	pmap_paddr_t    phys = ptoa(ppnum);
5193 
5194 	assert(ppnum != vm_page_fictitious_addr);
5195 
5196 	/* Only work with managed pages. */
5197 	if (!pa_valid(phys)) {
5198 		return;
5199 	}
5200 
5201 	/*
5202 	 * Determine the new protection.
5203 	 */
5204 	if (prot == VM_PROT_ALL) {
5205 		return;         /* nothing to do */
5206 	}
5207 
5208 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5209 
5210 #if XNU_MONITOR
5211 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5212 #else
5213 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5214 #endif
5215 
5216 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5217 }
5218 
5219 
5220 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
5221 MARK_AS_PMAP_TEXT void
5222 pmap_disable_user_jop_internal(pmap_t pmap)
5223 {
5224 	if (pmap == kernel_pmap) {
5225 		panic("%s: called with kernel_pmap", __func__);
5226 	}
5227 	validate_pmap_mutable(pmap);
5228 	pmap->disable_jop = true;
5229 }
5230 
5231 void
5232 pmap_disable_user_jop(pmap_t pmap)
5233 {
5234 #if XNU_MONITOR
5235 	pmap_disable_user_jop_ppl(pmap);
5236 #else
5237 	pmap_disable_user_jop_internal(pmap);
5238 #endif
5239 }
5240 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5241 
5242 /*
5243  * Indicates if the pmap layer enforces some additional restrictions on the
5244  * given set of protections.
5245  */
5246 bool
5247 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5248 {
5249 	return false;
5250 }
5251 
5252 /*
5253  *	Set the physical protection on the
5254  *	specified range of this map as requested.
5255  *	VERY IMPORTANT: Will not increase permissions.
5256  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5257  */
5258 void
5259 pmap_protect(
5260 	pmap_t pmap,
5261 	vm_map_address_t b,
5262 	vm_map_address_t e,
5263 	vm_prot_t prot)
5264 {
5265 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5266 }
5267 
5268 MARK_AS_PMAP_TEXT vm_map_address_t
5269 pmap_protect_options_internal(
5270 	pmap_t pmap,
5271 	vm_map_address_t start,
5272 	vm_map_address_t end,
5273 	vm_prot_t prot,
5274 	unsigned int options,
5275 	__unused void *args)
5276 {
5277 	tt_entry_t      *tte_p;
5278 	pt_entry_t      *bpte_p, *epte_p;
5279 	pt_entry_t      *pte_p;
5280 	boolean_t        set_NX = TRUE;
5281 #if (__ARM_VMSA__ > 7)
5282 	boolean_t        set_XO = FALSE;
5283 #endif
5284 	boolean_t        should_have_removed = FALSE;
5285 	bool             need_strong_sync = false;
5286 
5287 	/* Validate the pmap input before accessing its data. */
5288 	validate_pmap_mutable(pmap);
5289 
5290 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5291 
5292 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5293 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5294 	}
5295 
5296 #if DEVELOPMENT || DEBUG
5297 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5298 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5299 			should_have_removed = TRUE;
5300 		}
5301 	} else
5302 #endif
5303 	{
5304 		/* Determine the new protection. */
5305 		switch (prot) {
5306 #if (__ARM_VMSA__ > 7)
5307 		case VM_PROT_EXECUTE:
5308 			set_XO = TRUE;
5309 			OS_FALLTHROUGH;
5310 #endif
5311 		case VM_PROT_READ:
5312 		case VM_PROT_READ | VM_PROT_EXECUTE:
5313 			break;
5314 		case VM_PROT_READ | VM_PROT_WRITE:
5315 		case VM_PROT_ALL:
5316 			return end;         /* nothing to do */
5317 		default:
5318 			should_have_removed = TRUE;
5319 		}
5320 	}
5321 
5322 	if (should_have_removed) {
5323 		panic("%s: should have been a remove operation, "
5324 		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5325 		    __FUNCTION__,
5326 		    pmap, (void *)start, (void *)end, prot, options, args);
5327 	}
5328 
5329 #if DEVELOPMENT || DEBUG
5330 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5331 #else
5332 	if ((prot & VM_PROT_EXECUTE))
5333 #endif
5334 	{
5335 		set_NX = FALSE;
5336 	} else {
5337 		set_NX = TRUE;
5338 	}
5339 
5340 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5341 	vm_map_address_t va = start;
5342 	unsigned int npages = 0;
5343 
5344 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5345 
5346 	tte_p = pmap_tte(pmap, start);
5347 
5348 	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5349 		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5350 		bpte_p = &bpte_p[pte_index(pt_attr, start)];
5351 		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5352 		pte_p = bpte_p;
5353 
5354 		for (pte_p = bpte_p;
5355 		    pte_p < epte_p;
5356 		    pte_p += PAGE_RATIO, va += pmap_page_size) {
5357 			++npages;
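			/*
			 * Check for pending preemption at a fixed page interval and bail out
			 * early if needed; the caller resumes from the returned VA.
			 */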
5358 			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5359 			    pmap_pending_preemption())) {
5360 				break;
5361 			}
5362 			pt_entry_t spte;
5363 #if DEVELOPMENT || DEBUG
5364 			boolean_t  force_write = FALSE;
5365 #endif
5366 
5367 			spte = *((volatile pt_entry_t*)pte_p);
5368 
5369 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5370 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5371 				continue;
5372 			}
5373 
5374 			pmap_paddr_t    pa;
5375 			unsigned int    pai = 0;
5376 			boolean_t       managed = FALSE;
5377 
5378 			while (!managed) {
5379 				/*
5380 				 * It may be possible for the pte to transition from managed
5381 				 * to unmanaged in this timeframe; for now, elide the assert.
5382 				 * We should break out as a consequence of checking pa_valid.
5383 				 */
5384 				// assert(!ARM_PTE_IS_COMPRESSED(spte));
5385 				pa = pte_to_pa(spte);
5386 				if (!pa_valid(pa)) {
5387 					break;
5388 				}
5389 				pai = pa_index(pa);
5390 				pvh_lock(pai);
5391 				spte = *((volatile pt_entry_t*)pte_p);
5392 				pa = pte_to_pa(spte);
5393 				if (pai == pa_index(pa)) {
5394 					managed = TRUE;
5395 					break; // Leave the PVH locked as we will unlock it after we update the PTE
5396 				}
5397 				pvh_unlock(pai);
5398 			}
5399 
5400 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5401 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5402 				continue;
5403 			}
5404 
5405 			pt_entry_t      tmplate;
5406 
5407 			if (pmap == kernel_pmap) {
5408 #if DEVELOPMENT || DEBUG
5409 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5410 					force_write = TRUE;
5411 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5412 				} else
5413 #endif
5414 				{
5415 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5416 				}
5417 			} else {
5418 #if DEVELOPMENT || DEBUG
5419 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5420 					assert(pmap->type != PMAP_TYPE_NESTED);
5421 					force_write = TRUE;
5422 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5423 				} else
5424 #endif
5425 				{
5426 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5427 				}
5428 			}
5429 
5430 			/*
5431 			 * XXX Removing "NX" would
5432 			 * grant "execute" access
5433 			 * immediately, bypassing any
5434 			 * checks VM might want to do
5435 			 * in its soft fault path.
5436 			 * pmap_protect() and co. are
5437 			 * not allowed to increase
5438 			 * access permissions.
5439 			 */
5440 			if (set_NX) {
5441 				tmplate |= pt_attr_leaf_xn(pt_attr);
5442 			} else {
5443 #if     (__ARM_VMSA__ > 7)
5444 				if (pmap == kernel_pmap) {
5445 					/* do NOT clear "PNX"! */
5446 					tmplate |= ARM_PTE_NX;
5447 				} else {
5448 					/* do NOT clear "NX"! */
5449 					tmplate |= pt_attr_leaf_x(pt_attr);
5450 					if (set_XO) {
5451 						tmplate &= ~ARM_PTE_APMASK;
5452 						tmplate |= pt_attr_leaf_rona(pt_attr);
5453 					}
5454 				}
5455 #endif
5456 			}
5457 
5458 #if DEVELOPMENT || DEBUG
5459 			if (force_write) {
5460 				/*
5461 				 * TODO: Run CS/Monitor checks here.
5462 				 */
5463 				if (managed) {
5464 					/*
5465 					 * We are marking the page as writable,
5466 					 * so we consider it to be modified and
5467 					 * referenced.
5468 					 */
5469 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5470 					tmplate |= ARM_PTE_AF;
5471 
5472 					if (ppattr_test_reffault(pai)) {
5473 						ppattr_clear_reffault(pai);
5474 					}
5475 
5476 					if (ppattr_test_modfault(pai)) {
5477 						ppattr_clear_modfault(pai);
5478 					}
5479 				}
5480 			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5481 				/*
5482 				 * An immediate request for anything other than
5483 				 * write should still mark the page as
5484 				 * referenced if managed.
5485 				 */
5486 				if (managed) {
5487 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5488 					tmplate |= ARM_PTE_AF;
5489 
5490 					if (ppattr_test_reffault(pai)) {
5491 						ppattr_clear_reffault(pai);
5492 					}
5493 				}
5494 			}
5495 #endif
5496 
5497 			/* We do not expect to take a write fast-fault on this entry. */
5498 			pte_set_was_writeable(tmplate, false);
5499 
5500 			write_pte_fast(pte_p, tmplate);
5501 
5502 			if (managed) {
5503 				pvh_assert_locked(pai);
5504 				pvh_unlock(pai);
5505 			}
5506 		}
5507 		FLUSH_PTE_STRONG();
5508 		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5509 	} else {
5510 		va = end;
5511 	}
5512 
5513 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5514 	return va;
5515 }
5516 
5517 void
5518 pmap_protect_options(
5519 	pmap_t pmap,
5520 	vm_map_address_t b,
5521 	vm_map_address_t e,
5522 	vm_prot_t prot,
5523 	unsigned int options,
5524 	__unused void *args)
5525 {
5526 	vm_map_address_t l, beg;
5527 
5528 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5529 
5530 	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5531 		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5532 		    pmap, (uint64_t)b, (uint64_t)e);
5533 	}
5534 
5535 	/*
5536 	 * We allow single-page requests to execute non-preemptibly,
5537 	 * as it doesn't make sense to sample AST_URGENT for a single-page
5538 	 * operation, and there are a couple of special use cases that
5539 	 * require a non-preemptible single-page operation.
5540 	 */
5541 	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5542 		pmap_verify_preemptible();
5543 	}
5544 
5545 #if DEVELOPMENT || DEBUG
5546 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5547 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5548 			pmap_remove_options(pmap, b, e, options);
5549 			return;
5550 		}
5551 	} else
5552 #endif
5553 	{
5554 		/* Determine the new protection. */
5555 		switch (prot) {
5556 		case VM_PROT_EXECUTE:
5557 		case VM_PROT_READ:
5558 		case VM_PROT_READ | VM_PROT_EXECUTE:
5559 			break;
5560 		case VM_PROT_READ | VM_PROT_WRITE:
5561 		case VM_PROT_ALL:
5562 			return;         /* nothing to do */
5563 		default:
5564 			pmap_remove_options(pmap, b, e, options);
5565 			return;
5566 		}
5567 	}
5568 
5569 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5570 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5571 	    VM_KERNEL_ADDRHIDE(e));
5572 
5573 	beg = b;
5574 
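	/*
	 * Carve the request into chunks that never cross a twig (leaf-table)
	 * boundary, as required by pmap_protect_options_internal().  Each call may
	 * also return early on pending preemption, in which case the loop simply
	 * resumes from the address it returned.
	 */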
5575 	while (beg < e) {
5576 		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5577 
5578 		if (l > e) {
5579 			l = e;
5580 		}
5581 
5582 #if XNU_MONITOR
5583 		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5584 #else
5585 		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5586 #endif
5587 	}
5588 
5589 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5590 }
5591 
5592 /**
5593  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5594  *
5595  * @param pmap pmap to insert the pages into.
5596  * @param va virtual address to map the pages into.
5597  * @param pa page number of the first physical page to map.
5598  * @param size block size, in number of pages.
5599  * @param prot mapping protection attributes.
5600  * @param attr flags to pass to pmap_enter().
 * @param flags additional options, passed through to pmap_map_block_addr().
5601  *
5602  * @return KERN_SUCCESS.
5603  */
5604 kern_return_t
5605 pmap_map_block(
5606 	pmap_t pmap,
5607 	addr64_t va,
5608 	ppnum_t pa,
5609 	uint32_t size,
5610 	vm_prot_t prot,
5611 	int attr,
5612 	unsigned int flags)
5613 {
5614 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5615 }
5616 
5617 /**
5618  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5619  * As opposed to pmap_map_block(), this function takes
5620  * a physical address as an input and operates using the
5621  * page size associated with the input pmap.
5622  *
5623  * @param pmap pmap to insert the pages into.
5624  * @param va virtual address to map the pages into.
5625  * @param pa physical address of the first physical page to map.
5626  * @param size block size, in number of pages.
5627  * @param prot mapping protection attributes.
5628  * @param attr flags to pass to pmap_enter().
 * @param flags additional options; currently only reported in the panic message on failure.
5629  *
5630  * @return KERN_SUCCESS.
5631  */
5632 kern_return_t
5633 pmap_map_block_addr(
5634 	pmap_t pmap,
5635 	addr64_t va,
5636 	pmap_paddr_t pa,
5637 	uint32_t size,
5638 	vm_prot_t prot,
5639 	int attr,
5640 	unsigned int flags)
5641 {
5642 #if __ARM_MIXED_PAGE_SIZE__
5643 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5644 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5645 #else
5646 	const uint64_t pmap_page_size = PAGE_SIZE;
5647 #endif
5648 
5649 	for (ppnum_t page = 0; page < size; page++) {
5650 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5651 			panic("%s: failed pmap_enter_addr, "
5652 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5653 			    __FUNCTION__,
5654 			    pmap, va, (uint64_t)pa, size, prot, flags);
5655 		}
5656 
5657 		va += pmap_page_size;
5658 		pa += pmap_page_size;
5659 	}
5660 
5661 	return KERN_SUCCESS;
5662 }
5663 
5664 kern_return_t
5665 pmap_enter_addr(
5666 	pmap_t pmap,
5667 	vm_map_address_t v,
5668 	pmap_paddr_t pa,
5669 	vm_prot_t prot,
5670 	vm_prot_t fault_type,
5671 	unsigned int flags,
5672 	boolean_t wired)
5673 {
5674 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5675 }
5676 
5677 /*
5678  *	Insert the given physical page (pn) at
5679  *	the specified virtual address (v) in the
5680  *	target physical map with the protection requested.
5681  *
5682  *	If specified, the page will be wired down, meaning
5683  *	that the related pte can not be reclaimed.
5684  *
5685  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5686  *	or lose information.  That is, this routine must actually
5687  *	insert this page into the given map eventually (must make
5688  *	forward progress eventually).
5689  */
5690 kern_return_t
5691 pmap_enter(
5692 	pmap_t pmap,
5693 	vm_map_address_t v,
5694 	ppnum_t pn,
5695 	vm_prot_t prot,
5696 	vm_prot_t fault_type,
5697 	unsigned int flags,
5698 	boolean_t wired)
5699 {
5700 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5701 }
5702 
5703 /*
5704  * Attempt to commit the pte.
5705  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5706  * Performs no page table or accounting writes on failures.
5707  */
5708 static inline bool
5709 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5710 {
5711 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5712 	bool success = false, changed_wiring = false;
5713 
5714 	__unreachable_ok_push
5715 	if (TEST_PAGE_RATIO_4) {
5716 		/*
5717 		 * 16K virtual pages w/ 4K hw pages.
5718 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5719 		 * As a result we require the exclusive pmap lock.
5720 		 */
5721 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5722 		*old_pte = *pte_p;
5723 		if (*old_pte == new_pte) {
5724 			/* Another thread completed this operation. Nothing to do here. */
5725 			success = true;
5726 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5727 		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5728 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5729 			success = false;
5730 		} else {
5731 			write_pte_fast(pte_p, new_pte);
5732 			success = true;
5733 		}
5734 	} else {
5735 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5736 	}
5737 	__unreachable_ok_pop
5738 
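	/*
	 * If the PTE was committed and actually changed: a live old mapping requires
	 * a strong barrier and a TLB invalidate for this VA, while a previously
	 * invalid or compressed entry only needs a barrier plus ISB to become
	 * visible.  Any change in wiring is also accounted against the wired-memory
	 * ledger for user pmaps.
	 */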
5739 	if (success && *old_pte != new_pte) {
5740 		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5741 			FLUSH_PTE_STRONG();
5742 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5743 		} else {
5744 			FLUSH_PTE();
5745 			__builtin_arm_isb(ISB_SY);
5746 		}
5747 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5748 		    (new_pte & ARM_PTE_WIRED) != 0 :
5749 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5750 
5751 		if (pmap != kernel_pmap && changed_wiring) {
5752 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5753 			if (new_pte & ARM_PTE_WIRED) {
5754 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5755 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5756 			} else {
5757 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5758 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5759 			}
5760 		}
5761 
5762 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5763 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5764 	}
5765 	return success;
5766 }
5767 
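/*
 * Translate VM_WIMG_* cache-attribute flags into the memory-attribute index
 * (and shareability) bits of a leaf PTE.  The device, posted, real-time, and
 * write-combined variants are additionally marked PNX/NX so that such
 * mappings can never be executable.
 */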
5768 MARK_AS_PMAP_TEXT static pt_entry_t
5769 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5770 {
5771 	pt_entry_t pte;
5772 
5773 	switch (wimg & (VM_WIMG_MASK)) {
5774 	case VM_WIMG_IO:
5775 		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5776 		// Device-nGnRnE. On H14+, accesses to them can be reordered by
5777 		// AP, while preserving the security benefits of using device
5778 		// mapping against side-channel attacks. On pre-H14 platforms,
5779 		// the accesses will still be strongly ordered.
5780 		if (is_dram_addr(pa)) {
5781 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5782 		} else {
5783 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5784 		}
5785 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5786 		break;
5787 	case VM_WIMG_RT:
5788 #if HAS_UCNORMAL_MEM
5789 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5790 #else
5791 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5792 #endif
5793 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5794 		break;
5795 	case VM_WIMG_POSTED:
5796 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5797 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5798 		break;
5799 	case VM_WIMG_POSTED_REORDERED:
5800 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5801 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5802 		break;
5803 	case VM_WIMG_POSTED_COMBINED_REORDERED:
5804 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5805 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5806 		break;
5807 	case VM_WIMG_WCOMB:
5808 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5809 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5810 		break;
5811 	case VM_WIMG_WTHRU:
5812 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5813 #if     (__ARM_VMSA__ > 7)
5814 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5815 #else
5816 		pte |= ARM_PTE_SH;
5817 #endif
5818 		break;
5819 	case VM_WIMG_COPYBACK:
5820 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5821 #if     (__ARM_VMSA__ > 7)
5822 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5823 #else
5824 		pte |= ARM_PTE_SH;
5825 #endif
5826 		break;
5827 	case VM_WIMG_INNERWBACK:
5828 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5829 #if     (__ARM_VMSA__ > 7)
5830 		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5831 #else
5832 		pte |= ARM_PTE_SH;
5833 #endif
5834 		break;
5835 	default:
5836 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5837 #if     (__ARM_VMSA__ > 7)
5838 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5839 #else
5840 		pte |= ARM_PTE_SH;
5841 #endif
5842 	}
5843 
5844 	return pte;
5845 }
5846 
5847 
5848 /*
5849  * Construct a PTE (and the physical page attributes) for the given virtual to
5850  * physical mapping.
5851  *
5852  * This function has no side effects, which makes it safe to call while
5853  * attempting a pmap_enter transaction.
5854  */
5855 MARK_AS_PMAP_TEXT static pt_entry_t
5856 pmap_construct_pte(
5857 	const pmap_t pmap,
5858 	vm_map_address_t va,
5859 	pmap_paddr_t pa,
5860 	vm_prot_t prot,
5861 	vm_prot_t fault_type,
5862 	boolean_t wired,
5863 	const pt_attr_t* const pt_attr,
5864 	uint16_t *pp_attr_bits /* OUTPUT */
5865 	)
5866 {
5867 	bool set_NX = false, set_XO = false;
5868 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5869 	assert(pp_attr_bits != NULL);
5870 	*pp_attr_bits = 0;
5871 
5872 	if (wired) {
5873 		pte |= ARM_PTE_WIRED;
5874 	}
5875 
5876 #if DEVELOPMENT || DEBUG
5877 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5878 #else
5879 	if ((prot & VM_PROT_EXECUTE))
5880 #endif
5881 	{
5882 		set_NX = false;
5883 	} else {
5884 		set_NX = true;
5885 	}
5886 
5887 #if (__ARM_VMSA__ > 7)
5888 	if (prot == VM_PROT_EXECUTE) {
5889 		set_XO = true;
5890 	}
5891 #endif
5892 
5893 	if (set_NX) {
5894 		pte |= pt_attr_leaf_xn(pt_attr);
5895 	} else {
5896 #if     (__ARM_VMSA__ > 7)
5897 		if (pmap == kernel_pmap) {
5898 			pte |= ARM_PTE_NX;
5899 		} else {
5900 			pte |= pt_attr_leaf_x(pt_attr);
5901 		}
5902 #endif
5903 	}
5904 
5905 	if (pmap == kernel_pmap) {
5906 #if __ARM_KERNEL_PROTECT__
5907 		pte |= ARM_PTE_NG;
5908 #endif /* __ARM_KERNEL_PROTECT__ */
5909 		if (prot & VM_PROT_WRITE) {
5910 			pte |= ARM_PTE_AP(AP_RWNA);
5911 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5912 		} else {
5913 			pte |= ARM_PTE_AP(AP_RONA);
5914 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5915 		}
5916 #if     (__ARM_VMSA__ == 7)
5917 		if ((_COMM_PAGE_BASE_ADDRESS <= va) && (va < _COMM_PAGE_BASE_ADDRESS + _COMM_PAGE_AREA_LENGTH)) {
5918 			pte = (pte & ~(ARM_PTE_APMASK)) | ARM_PTE_AP(AP_RORO);
5919 		}
5920 #endif
5921 	} else {
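		/*
		 * Non-global (per-ASID) attribute: mappings in an ordinary user pmap are
		 * always ASID-tagged.  For a nested (shared-region) pmap, the mapping is
		 * only marked non-global if its twig-level region has been flagged in
		 * nested_region_asid_bitmap; other nested mappings remain global (not
		 * ASID-tagged).
		 */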
5922 		if (pmap->type != PMAP_TYPE_NESTED) {
5923 			pte |= ARM_PTE_NG;
5924 		} else if ((pmap->nested_region_asid_bitmap)
5925 		    && (va >= pmap->nested_region_addr)
5926 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5927 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5928 
5929 			if ((pmap->nested_region_asid_bitmap)
5930 			    && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5931 				pte |= ARM_PTE_NG;
5932 			}
5933 		}
5934 #if MACH_ASSERT
5935 		if (pmap->nested_pmap != NULL) {
5936 			vm_map_address_t nest_vaddr;
5937 			pt_entry_t *nest_pte_p;
5938 
5939 			nest_vaddr = va;
5940 
5941 			if ((nest_vaddr >= pmap->nested_region_addr)
5942 			    && (nest_vaddr < (pmap->nested_region_addr + pmap->nested_region_size))
5943 			    && ((nest_pte_p = pmap_pte(pmap->nested_pmap, nest_vaddr)) != PT_ENTRY_NULL)
5944 			    && (*nest_pte_p != ARM_PTE_TYPE_FAULT)
5945 			    && (!ARM_PTE_IS_COMPRESSED(*nest_pte_p, nest_pte_p))
5946 			    && (((*nest_pte_p) & ARM_PTE_NG) != ARM_PTE_NG)) {
5947 				unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5948 
5949 				if ((pmap->nested_pmap->nested_region_asid_bitmap)
5950 				    && !testbit(index, (int *)pmap->nested_pmap->nested_region_asid_bitmap)) {
5951 					panic("pmap_enter(): Global attribute conflict nest_pte_p=%p pmap=%p va=0x%llx spte=0x%llx",
5952 					    nest_pte_p, pmap, (uint64_t)va, (uint64_t)*nest_pte_p);
5953 				}
5954 			}
5955 		}
5956 #endif
5957 		if (prot & VM_PROT_WRITE) {
5958 			assert(pmap->type != PMAP_TYPE_NESTED);
5959 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5960 				if (fault_type & VM_PROT_WRITE) {
5961 					if (set_XO) {
5962 						pte |= pt_attr_leaf_rwna(pt_attr);
5963 					} else {
5964 						pte |= pt_attr_leaf_rw(pt_attr);
5965 					}
5966 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5967 				} else {
5968 					if (set_XO) {
5969 						pte |= pt_attr_leaf_rona(pt_attr);
5970 					} else {
5971 						pte |= pt_attr_leaf_ro(pt_attr);
5972 					}
5973 					/*
5974 					 * Mark the page as MODFAULT so that a subsequent write
5975 					 * may be handled through arm_fast_fault().
5976 					 */
5977 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5978 					pte_set_was_writeable(pte, true);
5979 				}
5980 			} else {
5981 				if (set_XO) {
5982 					pte |= pt_attr_leaf_rwna(pt_attr);
5983 				} else {
5984 					pte |= pt_attr_leaf_rw(pt_attr);
5985 				}
5986 				*pp_attr_bits |= PP_ATTR_REFERENCED;
5987 			}
5988 		} else {
5989 			if (set_XO) {
5990 				pte |= pt_attr_leaf_rona(pt_attr);
5991 			} else {
5992 				pte |= pt_attr_leaf_ro(pt_attr);
5993 			}
5994 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5995 		}
5996 	}
5997 
5998 	pte |= ARM_PTE_AF;
5999 	return pte;
6000 }
6001 
6002 MARK_AS_PMAP_TEXT kern_return_t
6003 pmap_enter_options_internal(
6004 	pmap_t pmap,
6005 	vm_map_address_t v,
6006 	pmap_paddr_t pa,
6007 	vm_prot_t prot,
6008 	vm_prot_t fault_type,
6009 	unsigned int flags,
6010 	boolean_t wired,
6011 	unsigned int options)
6012 {
6013 	ppnum_t         pn = (ppnum_t)atop(pa);
6014 	pt_entry_t      pte;
6015 	pt_entry_t      spte;
6016 	pt_entry_t      *pte_p;
6017 	bool            refcnt_updated;
6018 	bool            wiredcnt_updated;
6019 	bool            ro_va = false;
6020 	unsigned int    wimg_bits;
6021 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6022 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6023 	kern_return_t   kr = KERN_SUCCESS;
6024 	uint16_t pp_attr_bits;
6025 	volatile uint16_t *refcnt;
6026 	volatile uint16_t *wiredcnt;
6027 	pv_free_list_t *local_pv_free;
6028 
6029 	validate_pmap_mutable(pmap);
6030 
6031 #if XNU_MONITOR
6032 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6033 		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
6034 	}
6035 #endif
6036 
6037 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6038 
6039 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
6040 		panic("pmap_enter_options() pmap %p v 0x%llx",
6041 		    pmap, (uint64_t)v);
6042 	}
6043 
6044 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6045 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6046 		    pmap, (uint64_t)pa);
6047 	}
6048 
6049 	/* The PA should not extend beyond the architected physical address space */
6050 	pa &= ARM_PTE_PAGE_MASK;
6051 
6052 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6053 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6054 		extern vm_offset_t ctrr_test_page;
6055 		if (__probable(v != ctrr_test_page))
6056 #endif
6057 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6058 	}
6059 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6060 		if (__improbable(prot != VM_PROT_READ)) {
6061 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6062 			    __func__, (unsigned long long)v, prot);
6063 		}
6064 		ro_va = true;
6065 	}
6066 	assert(pn != vm_page_fictitious_addr);
6067 
6068 	refcnt_updated = false;
6069 	wiredcnt_updated = false;
6070 
6071 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6072 		/*
6073 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6074 		 *
6075 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6076 		 */
6077 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6078 	}
6079 	pmap_lock(pmap, lock_mode);
6080 
6081 	/*
6082 	 *	Expand pmap to include this pte.  Assume that
6083 	 *	pmap is always expanded to include enough hardware
6084 	 *	pages to map one VM page.
6085 	 */
6086 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6087 		/* Must unlock to expand the pmap. */
6088 		pmap_unlock(pmap, lock_mode);
6089 
6090 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6091 
6092 		if (kr != KERN_SUCCESS) {
6093 			return kr;
6094 		}
6095 
6096 		pmap_lock(pmap, lock_mode);
6097 	}
6098 
6099 	if (options & PMAP_OPTIONS_NOENTER) {
6100 		pmap_unlock(pmap, lock_mode);
6101 		return KERN_SUCCESS;
6102 	}
6103 
6104 	/*
6105 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6106 	 * done via a cmpxchg loop.
6107 	 * We need to be careful about modifying non-local data structures before committing
6108 	 * the new pte since we may need to re-do the transaction.
6109 	 */
6110 	spte = os_atomic_load(pte_p, relaxed);
6111 	while (!committed) {
6112 		refcnt = NULL;
6113 		wiredcnt = NULL;
6114 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6115 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6116 
6117 		if (pmap != kernel_pmap) {
6118 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6119 			refcnt = &ptd_info->refcnt;
6120 			wiredcnt = &ptd_info->wiredcnt;
6121 			/*
6122 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6123 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6124 			 * or acquire the pmap lock exclusive.
6125 			 */
6126 			if (!wiredcnt_updated) {
6127 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6128 				wiredcnt_updated = true;
6129 			}
6130 			if (!refcnt_updated) {
6131 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6132 				refcnt_updated = true;
6133 				drop_refcnt = true;
6134 			}
6135 		}
6136 
6137 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6138 			/*
6139 			 * There is already a mapping here & it's for a different physical page.
6140 			 * First remove that mapping.
6141 			 *
6142 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6143 			 */
6144 			if (lock_mode == PMAP_LOCK_SHARED) {
6145 				if (pmap_lock_shared_to_exclusive(pmap)) {
6146 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6147 				} else {
6148 					/*
6149 					 * We failed to upgrade to an exclusive lock.
6150 					 * As a result we no longer hold the lock at all,
6151 					 * so we need to re-acquire it and restart the transaction.
6152 					 */
6153 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6154 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6155 					/* pmap might have changed after we dropped the lock. Try again. */
6156 					spte = os_atomic_load(pte_p, relaxed);
6157 					continue;
6158 				}
6159 			}
6160 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6161 			spte = ARM_PTE_TYPE_FAULT;
6162 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6163 		}
6164 
6165 		pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6166 
6167 		if (pa_valid(pa)) {
6168 			unsigned int pai;
6169 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6170 
6171 			is_internal = FALSE;
6172 			is_altacct = FALSE;
6173 
6174 			pai = pa_index(pa);
6175 
6176 			pvh_lock(pai);
6177 
6178 			/*
6179 			 * Make sure that the current per-cpu PV free list has
6180 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6181 			 * if the transaction succeeds. We're either in the
6182 			 * PPL (which can't be preempted) or we've explicitly disabled preemption.
6183 			 * Note that we can still be interrupted, but a primary
6184 			 * interrupt handler can never enter the pmap.
6185 			 */
6186 #if !XNU_MONITOR
6187 			assert(get_preemption_level() > 0);
6188 #endif
6189 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6190 			pv_entry_t **pv_h = pai_to_pvh(pai);
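			/*
			 * No PV node needs to be allocated if the page currently has no
			 * mappings (the PV head can simply point at this PTE), or if this
			 * exact PTE is already the single mapping recorded in the head.
			 */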
6191 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6192 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6193 
6194 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6195 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6196 				int new_allocated_pves = 0;
6197 
6198 				while (new_allocated_pves < 2) {
6199 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6200 					pv_status = pv_alloc(pmap, pai, lock_mode, &new_pve_p[new_allocated_pves]);
6201 					if (pv_status == PV_ALLOC_FAIL) {
6202 						break;
6203 					} else if (pv_status == PV_ALLOC_RETRY) {
6204 						/*
6205 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6206 						 * it will have dropped the pmap lock while doing so.
6207 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6208 						 * be on a different CPU now.
6209 						 */
6210 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6211 					} else {
6212 						/* If we've gotten this far then a node should've been allocated. */
6213 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6214 
6215 						new_allocated_pves++;
6216 					}
6217 				}
6218 
6219 				for (int i = 0; i < new_allocated_pves; i++) {
6220 					pv_free(new_pve_p[i]);
6221 				}
6222 			}
6223 
6224 			if (pv_status == PV_ALLOC_FAIL) {
6225 				pvh_unlock(pai);
6226 				kr = KERN_RESOURCE_SHORTAGE;
6227 				break;
6228 			} else if (pv_status == PV_ALLOC_RETRY) {
6229 				pvh_unlock(pai);
6230 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6231 				spte = os_atomic_load(pte_p, relaxed);
6232 				continue;
6233 			}
6234 
6235 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6236 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6237 			} else {
6238 				wimg_bits = pmap_cache_attributes(pn);
6239 			}
6240 
6241 			/* We may be retrying this operation after dropping the PVH lock.
6242 			 * Cache attributes for the physical page may have changed while the lock
6243 			 * was dropped, so clear any cache attributes we may have previously set
6244 			 * in the PTE template. */
6245 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6246 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6247 
6248 #if XNU_MONITOR
6249 			/* The regular old kernel is not allowed to remap PPL pages. */
6250 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6251 				panic("%s: page belongs to PPL, "
6252 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6253 				    __FUNCTION__,
6254 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6255 			}
6256 
6257 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6258 				panic("%s: page locked down, "
6259 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6260 				    __FUNCTION__,
6261 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6262 			}
6263 #endif
6264 
6265 
6266 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6267 			if (!committed) {
6268 				pvh_unlock(pai);
6269 				continue;
6270 			}
6271 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6272 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6273 
6274 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6275 			/*
6276 			 * If there was already a valid pte here then we reuse its reference
6277 			 * on the ptd and drop the one that we took above.
6278 			 */
6279 			drop_refcnt = had_valid_mapping;
6280 
6281 			if (!had_valid_mapping) {
6282 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6283 				int pve_ptep_idx = 0;
6284 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6285 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6286 				if (pv_status != PV_ALLOC_SUCCESS) {
6287 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6288 					    __func__, pv_status, new_pve_p, pmap);
6289 				}
6290 
6291 				if (pmap != kernel_pmap) {
6292 					if (options & PMAP_OPTIONS_INTERNAL) {
6293 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6294 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6295 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6296 							/*
6297 							 * Make a note to ourselves that this
6298 							 * mapping is using alternative
6299 							 * accounting. We'll need this in order
6300 							 * to know which ledger to debit when
6301 							 * the mapping is removed.
6302 							 *
6303 							 * The altacct bit must be set while
6304 							 * the pv head is locked. Defer the
6305 							 * ledger accounting until after we've
6306 							 * dropped the lock.
6307 							 */
6308 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6309 							is_altacct = TRUE;
6310 						}
6311 					}
6312 					if (ppattr_test_reusable(pai) &&
6313 					    !is_altacct) {
6314 						is_reusable = TRUE;
6315 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6316 						is_internal = TRUE;
6317 					} else {
6318 						is_external = TRUE;
6319 					}
6320 				}
6321 			}
6322 
6323 			pvh_unlock(pai);
6324 
6325 			if (pp_attr_bits != 0) {
6326 				ppattr_pa_set_bits(pa, pp_attr_bits);
6327 			}
6328 
6329 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6330 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6331 
6332 				if (is_internal) {
6333 					/*
6334 					 * Make corresponding adjustments to
6335 					 * phys_footprint statistics.
6336 					 */
6337 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6338 					if (is_altacct) {
6339 						/*
6340 						 * If this page is internal and
6341 						 * in an IOKit region, credit
6342 						 * the task's total count of
6343 						 * dirty, internal IOKit pages.
6344 						 * It should *not* count towards
6345 						 * the task's total physical
6346 						 * memory footprint, because
6347 						 * this entire region was
6348 						 * already billed to the task
6349 						 * at the time the mapping was
6350 						 * created.
6351 						 *
6352 						 * Put another way, this is
6353 						 * internal++ and
6354 						 * alternate_accounting++, so
6355 						 * net effect on phys_footprint
6356 						 * is 0. That means: don't
6357 						 * touch phys_footprint here.
6358 						 */
6359 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6360 					} else {
6361 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6362 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6363 							skip_footprint_debit = true;
6364 						} else {
6365 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6366 						}
6367 					}
6368 				}
6369 				if (is_reusable) {
6370 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6371 				} else if (is_external) {
6372 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6373 				}
6374 			}
6375 		} else {
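			/*
			 * The physical address is not managed RAM (e.g. device memory):
			 * executable mappings are refused, and the PTE takes the requested
			 * (or default) WIMG attributes without any PV-list tracking.
			 */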
6376 			if (prot & VM_PROT_EXECUTE) {
6377 				kr = KERN_FAILURE;
6378 				break;
6379 			}
6380 
6381 			wimg_bits = pmap_cache_attributes(pn);
6382 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6383 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6384 			}
6385 
6386 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6387 
6388 #if XNU_MONITOR
6389 			if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
6390 				uint64_t xprr_perm = pte_to_xprr_perm(pte);
6391 				switch (xprr_perm) {
6392 				case XPRR_KERN_RO_PERM:
6393 					break;
6394 				case XPRR_KERN_RW_PERM:
6395 					pte &= ~ARM_PTE_XPRR_MASK;
6396 					pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
6397 					break;
6398 				default:
6399 					panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
6400 				}
6401 			}
6402 #endif
6403 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6404 			if (committed) {
6405 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6406 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6407 
6408 				/**
6409 				 * If there was already a valid pte here then we reuse its
6410 				 * reference on the ptd and drop the one that we took above.
6411 				 */
6412 				drop_refcnt = had_valid_mapping;
6413 			}
6414 		}
6415 		if (committed) {
6416 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6417 				assert(pmap != kernel_pmap);
6418 
6419 				/* One less "compressed" */
6420 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6421 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6422 
6423 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6424 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6425 				} else if (!skip_footprint_debit) {
6426 					/* Was part of the footprint */
6427 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6428 				}
6429 				/* The old entry held a reference so drop the extra one that we took above. */
6430 				drop_refcnt = true;
6431 			}
6432 		}
6433 	}
6434 
6435 	if (drop_refcnt && refcnt != NULL) {
6436 		assert(refcnt_updated);
6437 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6438 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6439 		}
6440 	}
6441 
6442 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6443 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6444 	}
6445 
6446 	pmap_unlock(pmap, lock_mode);
6447 
6448 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6449 		pmap_phys_write_disable(v);
6450 	}
6451 
6452 	return kr;
6453 }
6454 
6455 kern_return_t
6456 pmap_enter_options_addr(
6457 	pmap_t pmap,
6458 	vm_map_address_t v,
6459 	pmap_paddr_t pa,
6460 	vm_prot_t prot,
6461 	vm_prot_t fault_type,
6462 	unsigned int flags,
6463 	boolean_t wired,
6464 	unsigned int options,
6465 	__unused void   *arg)
6466 {
6467 	kern_return_t kr = KERN_FAILURE;
6468 
6469 
6470 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6471 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6472 
6473 
6474 #if XNU_MONITOR
6475 	/*
6476 	 * If NOWAIT was not requested, loop until the enter does not
6477 	 * fail due to lack of resources.
6478 	 */
6479 	while ((kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT)) == KERN_RESOURCE_SHORTAGE) {
6480 		pmap_alloc_page_for_ppl((options & PMAP_OPTIONS_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6481 		if (options & PMAP_OPTIONS_NOWAIT) {
6482 			break;
6483 		}
6484 	}
6485 
6486 	pmap_ledger_check_balance(pmap);
6487 #else
6488 	kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6489 #endif
6490 
6491 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6492 
6493 	return kr;
6494 }
6495 
6496 kern_return_t
6497 pmap_enter_options(
6498 	pmap_t pmap,
6499 	vm_map_address_t v,
6500 	ppnum_t pn,
6501 	vm_prot_t prot,
6502 	vm_prot_t fault_type,
6503 	unsigned int flags,
6504 	boolean_t wired,
6505 	unsigned int options,
6506 	__unused void   *arg)
6507 {
6508 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6509 }
6510 
6511 /*
6512  *	Routine:	pmap_change_wiring
6513  *	Function:	Change the wiring attribute for a map/virtual-address
6514  *			pair.
6515  *	In/out conditions:
6516  *			The mapping must already exist in the pmap.
6517  */
6518 MARK_AS_PMAP_TEXT void
6519 pmap_change_wiring_internal(
6520 	pmap_t pmap,
6521 	vm_map_address_t v,
6522 	boolean_t wired)
6523 {
6524 	pt_entry_t     *pte_p;
6525 	pmap_paddr_t    pa;
6526 
6527 	validate_pmap_mutable(pmap);
6528 
6529 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6530 
6531 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6532 
6533 	pte_p = pmap_pte(pmap, v);
6534 	if (pte_p == PT_ENTRY_NULL) {
6535 		if (!wired) {
6536 			/*
6537 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6538 			 * may have been freed by a remove operation.
6539 			 */
6540 			goto pmap_change_wiring_return;
6541 		} else {
6542 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6543 		}
6544 	}
6545 	/*
6546 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6547 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6548 	 */
6549 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6550 
6551 	while (pa_valid(pa)) {
6552 		pmap_paddr_t new_pa;
6553 
6554 		pvh_lock(pa_index(pa));
6555 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6556 
6557 		if (pa == new_pa) {
6558 			break;
6559 		}
6560 
6561 		pvh_unlock(pa_index(pa));
6562 		pa = new_pa;
6563 	}
6564 
6565 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6566 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6567 		if (!wired) {
6568 			/* PTE cleared by prior remove/disconnect operation */
6569 			goto pmap_change_wiring_cleanup;
6570 		} else {
6571 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6572 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6573 		}
6574 	}
6575 
6576 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6577 	if (wired != pte_is_wired(*pte_p)) {
6578 		pte_set_wired(pmap, pte_p, wired);
6579 		if (pmap != kernel_pmap) {
6580 			if (wired) {
6581 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6582 					} else {
6583 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6584 			}
6585 		}
6586 	}
6587 
6588 pmap_change_wiring_cleanup:
6589 	if (pa_valid(pa)) {
6590 		pvh_unlock(pa_index(pa));
6591 	}
6592 
6593 pmap_change_wiring_return:
6594 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6595 }
6596 
6597 void
6598 pmap_change_wiring(
6599 	pmap_t pmap,
6600 	vm_map_address_t v,
6601 	boolean_t wired)
6602 {
6603 #if XNU_MONITOR
6604 	pmap_change_wiring_ppl(pmap, v, wired);
6605 
6606 	pmap_ledger_check_balance(pmap);
6607 #else
6608 	pmap_change_wiring_internal(pmap, v, wired);
6609 #endif
6610 }
6611 
6612 MARK_AS_PMAP_TEXT pmap_paddr_t
6613 pmap_find_pa_internal(
6614 	pmap_t pmap,
6615 	addr64_t va)
6616 {
6617 	pmap_paddr_t    pa = 0;
6618 
6619 	validate_pmap(pmap);
6620 
6621 	if (pmap != kernel_pmap) {
6622 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6623 	}
6624 
6625 	pa = pmap_vtophys(pmap, va);
6626 
6627 	if (pmap != kernel_pmap) {
6628 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6629 	}
6630 
6631 	return pa;
6632 }
6633 
6634 pmap_paddr_t
6635 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6636 {
6637 	pmap_paddr_t pa = 0;
6638 
6639 	if (pmap == kernel_pmap) {
6640 		pa = mmu_kvtop(va);
6641 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6642 		/*
6643 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6644 		 * translation even if PAN would prevent kernel access through the translation.
6645 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6646 		 */
6647 		pa = mmu_uvtop(va);
6648 	}
6649 	return pa;
6650 }
6651 
6652 pmap_paddr_t
6653 pmap_find_pa(
6654 	pmap_t pmap,
6655 	addr64_t va)
6656 {
6657 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6658 
6659 	if (pa != 0) {
6660 		return pa;
6661 	}
6662 
6663 	if (not_in_kdp) {
6664 #if XNU_MONITOR
6665 		return pmap_find_pa_ppl(pmap, va);
6666 #else
6667 		return pmap_find_pa_internal(pmap, va);
6668 #endif
6669 	} else {
6670 		return pmap_vtophys(pmap, va);
6671 	}
6672 }
6673 
6674 ppnum_t
6675 pmap_find_phys_nofault(
6676 	pmap_t pmap,
6677 	addr64_t va)
6678 {
6679 	ppnum_t ppn;
6680 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6681 	return ppn;
6682 }
6683 
6684 ppnum_t
6685 pmap_find_phys(
6686 	pmap_t pmap,
6687 	addr64_t va)
6688 {
6689 	ppnum_t ppn;
6690 	ppn = atop(pmap_find_pa(pmap, va));
6691 	return ppn;
6692 }
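
/*
 * Illustrative usage sketch: looking up the physical page behind a user
 * virtual address.  'task_pmap' and 'uva' are hypothetical; a return value of
 * 0 means no valid mapping was found.
 *
 *	ppnum_t pn = pmap_find_phys(task_pmap, uva);
 *	if (pn != 0) {
 *		pmap_paddr_t pa = ptoa(pn) | (uva & PAGE_MASK);
 *	}
 */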
6693 
6694 /**
6695  * Translate a kernel virtual address into a physical address.
6696  *
6697  * @param va The kernel virtual address to translate. Does not work on user
6698  *           virtual addresses.
6699  *
6700  * @return The physical address if the translation was successful, or zero if
6701  *         no valid mappings were found for the given virtual address.
6702  */
6703 pmap_paddr_t
6704 kvtophys(vm_offset_t va)
6705 {
6706 	/**
6707 	 * Attempt to do the translation first in hardware using the AT (address
6708 	 * translation) instruction. This will attempt to use the MMU to do the
6709 	 * translation for us.
6710 	 */
6711 	pmap_paddr_t pa = mmu_kvtop(va);
6712 
6713 	if (pa) {
6714 		return pa;
6715 	}
6716 
6717 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6718 	return pmap_vtophys(kernel_pmap, va);
6719 }
6720 
6721 /**
6722  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6723  * points to a non-kernel-managed physical page, then this call will panic().
6724  *
6725  * @note The output of this function is guaranteed to be a kernel-managed
6726  *       physical page, which means it's safe to pass the output directly to
6727  *       pa_index() to create a physical address index for various pmap data
6728  *       structures.
6729  *
6730  * @param va The kernel virtual address to translate. Does not work on user
6731  *           virtual addresses.
6732  *
6733  * @return The translated physical address for the given virtual address.
6734  */
6735 pmap_paddr_t
6736 kvtophys_nofail(vm_offset_t va)
6737 {
6738 	pmap_paddr_t pa = kvtophys(va);
6739 
6740 	if (!pa_valid(pa)) {
6741 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6742 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6743 	}
6744 
6745 	return pa;
6746 }
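
/*
 * Illustrative usage sketch: translating a kernel VA that is known to map a
 * kernel-managed page, then using the result to lock its PV head.  'ptr' is a
 * hypothetical kernel pointer.
 *
 *	pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)ptr);
 *	unsigned int pai = pa_index(pa);
 *	pvh_lock(pai);
 *	// ... inspect or update per-page state ...
 *	pvh_unlock(pai);
 */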
6747 
6748 pmap_paddr_t
6749 pmap_vtophys(
6750 	pmap_t pmap,
6751 	addr64_t va)
6752 {
6753 	if ((va < pmap->min) || (va >= pmap->max)) {
6754 		return 0;
6755 	}
6756 
6757 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6758 
6759 #if (__ARM_VMSA__ == 7)
6760 	tt_entry_t     *tte_p, tte;
6761 	pt_entry_t     *pte_p;
6762 	pmap_paddr_t    pa;
6763 
6764 	tte_p = pmap_tte(pmap, va);
6765 	if (tte_p == (tt_entry_t *) NULL) {
6766 		return (pmap_paddr_t) 0;
6767 	}
6768 
6769 	tte = *tte_p;
6770 	if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
6771 		pte_p = (pt_entry_t *) ttetokv(tte) + pte_index(pt_attr, va);
6772 		pa = pte_to_pa(*pte_p) | (va & ARM_PGMASK);
6774 #if DEVELOPMENT || DEBUG
6775 		if (atop(pa) != 0 &&
6776 		    ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
6777 			panic("pmap_vtophys(%p,0x%llx): compressed pte_p=%p 0x%llx with ppn=0x%x",
6778 			    pmap, va, pte_p, (uint64_t) (*pte_p), atop(pa));
6779 		}
6780 #endif /* DEVELOPMENT || DEBUG */
6781 	} else if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
6782 		if ((tte & ARM_TTE_BLOCK_SUPER) == ARM_TTE_BLOCK_SUPER) {
6783 			pa = suptte_to_pa(tte) | (va & ARM_TT_L1_SUPER_OFFMASK);
6784 		} else {
6785 			pa = sectte_to_pa(tte) | (va & ARM_TT_L1_BLOCK_OFFMASK);
6786 		}
6787 	} else {
6788 		pa = 0;
6789 	}
6790 #else
6791 	tt_entry_t * ttp = NULL;
6792 	tt_entry_t * ttep = NULL;
6793 	tt_entry_t   tte = ARM_TTE_EMPTY;
6794 	pmap_paddr_t pa = 0;
6795 	unsigned int cur_level;
6796 
6797 	ttp = pmap->tte;
6798 
6799 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6800 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6801 
6802 		tte = *ttep;
6803 
6804 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6805 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6806 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6807 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6808 
6809 		if ((tte & valid_mask) != valid_mask) {
6810 			return (pmap_paddr_t) 0;
6811 		}
6812 
6813 		/* This detects both leaf entries and intermediate block mappings. */
6814 		if ((tte & type_mask) == type_block) {
6815 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6816 			break;
6817 		}
6818 
6819 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6820 	}
6821 #endif
6822 
6823 	return pa;
6824 }
6825 
6826 /*
6827  *	pmap_init_pte_page - Initialize a page table page.
6828  */
6829 MARK_AS_PMAP_TEXT void
6830 pmap_init_pte_page(
6831 	pmap_t pmap,
6832 	pt_entry_t *pte_p,
6833 	vm_offset_t va,
6834 	unsigned int ttlevel,
6835 	boolean_t alloc_ptd)
6836 {
6837 	pt_desc_t   *ptdp = NULL;
6838 	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6839 
6840 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6841 		if (alloc_ptd) {
6842 			/*
6843 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6844 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6845 			 * bootstrap request, so we check for an existing PTD here.
6846 			 */
6847 			ptdp = ptd_alloc(pmap);
6848 			if (ptdp == NULL) {
6849 				panic("%s: unable to allocate PTD", __func__);
6850 			}
6851 			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6852 		} else {
6853 			panic("pmap_init_pte_page(): pte_p %p", pte_p);
6854 		}
6855 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6856 		ptdp = pvh_ptd(pvh);
6857 	} else {
6858 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6859 	}
6860 
6861 	// below barrier ensures previous updates to the page are visible to PTW before
6862 	// The barrier below ensures that previous updates to the page are visible to the
6863 	// page table walker (PTW) before the page is linked into the previous level's PTE.
6864 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6865 }
6866 
6867 /*
6868  *	Routine:	pmap_expand
6869  *
6870  *	Expands a pmap to be able to map the specified virtual address.
6871  *
6872  *	Allocates new memory for the default (COARSE) translation table
6873  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6874  *	also allocates space for the corresponding pv entries.
6875  *
6876  *	Nothing should be locked.
6877  */
6878 MARK_AS_PMAP_TEXT static kern_return_t
6879 pmap_expand(
6880 	pmap_t pmap,
6881 	vm_map_address_t v,
6882 	unsigned int options,
6883 	unsigned int level)
6884 {
6885 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6886 
6887 	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6888 		return KERN_INVALID_ADDRESS;
6889 	}
6890 #if     (__ARM_VMSA__ == 7)
6891 	vm_offset_t     pa;
6892 	tt_entry_t              *tte_p;
6893 	tt_entry_t              *tt_p;
6894 	unsigned int    i;
6895 
6896 #if DEVELOPMENT || DEBUG
6897 	/*
6898 	 * We no longer support root level expansion; panic in case something
6899 	 * still attempts to trigger it.
6900 	 */
6901 	i = tte_index(pt_attr, v);
6902 
6903 	if (i >= pmap->tte_index_max) {
6904 		panic("%s: index out of range, index=%u, max=%u, "
6905 		    "pmap=%p, addr=%p, options=%u, level=%u",
6906 		    __func__, i, pmap->tte_index_max,
6907 		    pmap, (void *)v, options, level);
6908 	}
6909 #endif /* DEVELOPMENT || DEBUG */
6910 
6911 	if (level == 1) {
6912 		return KERN_SUCCESS;
6913 	}
6914 
6915 	{
6916 		tt_entry_t     *tte_next_p;
6917 
6918 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6919 		pa = 0;
6920 		if (pmap_pte(pmap, v) != PT_ENTRY_NULL) {
6921 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6922 			return KERN_SUCCESS;
6923 		}
6924 		tte_p = &pmap->tte[ttenum(v & ~ARM_TT_L1_PT_OFFMASK)];
6925 		for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6926 			if (tte_to_pa(*tte_next_p)) {
6927 				pa = tte_to_pa(*tte_next_p);
6928 				break;
6929 			}
6930 			tte_next_p++;
6931 		}
6932 		pa = pa & ~PAGE_MASK;
6933 		if (pa) {
6934 			tte_p =  &pmap->tte[ttenum(v)];
6935 			*tte_p =  pa_to_tte(pa) | (((v >> ARM_TT_L1_SHIFT) & 0x3) << 10) | ARM_TTE_TYPE_TABLE;
6936 			FLUSH_PTE();
6937 			PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~ARM_TT_L1_OFFMASK),
6938 			    VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_OFFMASK) + ARM_TT_L1_SIZE), *tte_p);
6939 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6940 			return KERN_SUCCESS;
6941 		}
6942 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6943 	}
6944 	v = v & ~ARM_TT_L1_PT_OFFMASK;
6945 
6946 
6947 	while (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6948 		/*
6949 		 *	Allocate a VM page for the level 2 page table entries.
6950 		 */
6951 		while (pmap_tt_allocate(pmap, &tt_p, PMAP_TT_L2_LEVEL, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
6952 			if (options & PMAP_OPTIONS_NOWAIT) {
6953 				return KERN_RESOURCE_SHORTAGE;
6954 			}
6955 			VM_PAGE_WAIT();
6956 		}
6957 
6958 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6959 		/*
6960 		 *	See if someone else expanded us first
6961 		 */
6962 		if (pmap_pte(pmap, v) == PT_ENTRY_NULL) {
6963 			tt_entry_t     *tte_next_p;
6964 
6965 			pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, PMAP_TT_L2_LEVEL, FALSE);
6966 			pa = kvtophys_nofail((vm_offset_t)tt_p);
6967 			tte_p = &pmap->tte[ttenum(v)];
6968 			for (i = 0, tte_next_p = tte_p; i < 4; i++) {
6969 				*tte_next_p = pa_to_tte(pa) | ARM_TTE_TYPE_TABLE;
6970 				PMAP_TRACE(5, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + (i * ARM_TT_L1_SIZE)),
6971 				    VM_KERNEL_ADDRHIDE((v & ~ARM_TT_L1_PT_OFFMASK) + ((i + 1) * ARM_TT_L1_SIZE)), *tte_p);
6972 				tte_next_p++;
6973 				pa = pa + 0x400;
6974 			}
6975 			FLUSH_PTE();
6976 
6977 			pa = 0x0ULL;
6978 			tt_p = (tt_entry_t *)NULL;
6979 		}
6980 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6981 		if (tt_p != (tt_entry_t *)NULL) {
6982 			pmap_tt_deallocate(pmap, tt_p, PMAP_TT_L2_LEVEL);
6983 			tt_p = (tt_entry_t *)NULL;
6984 		}
6985 	}
6986 	return KERN_SUCCESS;
6987 #else
6988 	pmap_paddr_t    pa;
6989 	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
6990 	tt_entry_t              *tte_p;
6991 	tt_entry_t              *tt_p;
6992 
6993 	pa = 0x0ULL;
6994 	tt_p =  (tt_entry_t *)NULL;
6995 
6996 	for (; ttlevel < level; ttlevel++) {
6997 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6998 
6999 		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
7000 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
7001 			while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
7002 				if (options & PMAP_OPTIONS_NOWAIT) {
7003 					return KERN_RESOURCE_SHORTAGE;
7004 				}
7005 #if XNU_MONITOR
7006 				panic("%s: failed to allocate tt, "
7007 				    "pmap=%p, v=%p, options=0x%x, level=%u",
7008 				    __FUNCTION__,
7009 				    pmap, (void *)v, options, level);
7010 #else
7011 				VM_PAGE_WAIT();
7012 #endif
7013 			}
7014 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7015 			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
7016 				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
7017 				pa = kvtophys_nofail((vm_offset_t)tt_p);
7018 				tte_p = pmap_ttne(pmap, ttlevel, v);
7019 				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7020 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7021 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
7022 				pa = 0x0ULL;
7023 				tt_p = (tt_entry_t *)NULL;
7024 			}
7025 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7026 		} else {
7027 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
7028 		}
7029 
7030 		if (tt_p != (tt_entry_t *)NULL) {
7031 			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
7032 			tt_p = (tt_entry_t *)NULL;
7033 		}
7034 	}
7035 
7036 	return KERN_SUCCESS;
7037 #endif
7038 }
7039 
7040 /*
7041  *	Routine:	pmap_collect
7042  *	Function:
7043  *		Garbage collects the physical map system for
7044  *		pages which are no longer used.
7045  *		Success need not be guaranteed -- that is, there
7046  *		may well be pages which are not referenced, but
7047  *		others may be collected.
7048  */
7049 void
7050 pmap_collect(pmap_t pmap)
7051 {
7052 	if (pmap == PMAP_NULL) {
7053 		return;
7054 	}
7055 
7056 #if 0
7057 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
7058 	if ((pmap->nested == FALSE) && (pmap != kernel_pmap)) {
7059 		/* TODO: Scan for vm page assigned to top level page tables with no reference */
7060 	}
7061 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
7062 #endif
7063 
7064 	return;
7065 }
7066 
7067 /*
7068  *	Routine:	pmap_gc
7069  *	Function:
7070  *              Pmap garbage collection
7071  *		Called by the pageout daemon when pages are scarce.
7072  *
7073  */
7074 void
7075 pmap_gc(
7076 	void)
7077 {
7078 #if XNU_MONITOR
7079 	/*
7080 	 * We cannot invoke the scheduler from the PPL, so for now we elide the
7081 	 * GC logic if the PPL is enabled.
7082 	 */
7083 #endif
7084 #if !XNU_MONITOR
7085 	pmap_t  pmap, pmap_next;
7086 	boolean_t       gc_wait;
7087 
7088 	if (pmap_gc_allowed &&
7089 	    (pmap_gc_allowed_by_time_throttle ||
7090 	    pmap_gc_forced)) {
7091 		pmap_gc_forced = FALSE;
7092 		pmap_gc_allowed_by_time_throttle = FALSE;
7093 		pmap_simple_lock(&pmaps_lock);
7094 		pmap = CAST_DOWN_EXPLICIT(pmap_t, queue_first(&map_pmap_list));
7095 		while (!queue_end(&map_pmap_list, (queue_entry_t)pmap)) {
7096 			if (!(pmap->gc_status & PMAP_GC_INFLIGHT)) {
7097 				pmap->gc_status |= PMAP_GC_INFLIGHT;
7098 			}
7099 			pmap_simple_unlock(&pmaps_lock);
7100 
7101 			pmap_collect(pmap);
7102 
7103 			pmap_simple_lock(&pmaps_lock);
7104 			gc_wait = (pmap->gc_status & PMAP_GC_WAIT);
7105 			pmap->gc_status &= ~(PMAP_GC_INFLIGHT | PMAP_GC_WAIT);
7106 			pmap_next = CAST_DOWN_EXPLICIT(pmap_t, queue_next(&pmap->pmaps));
7107 			if (gc_wait) {
7108 				if (!queue_end(&map_pmap_list, (queue_entry_t)pmap_next)) {
7109 					pmap_next->gc_status |= PMAP_GC_INFLIGHT;
7110 				}
7111 				pmap_simple_unlock(&pmaps_lock);
7112 				thread_wakeup((event_t) &pmap->gc_status);
7113 				pmap_simple_lock(&pmaps_lock);
7114 			}
7115 			pmap = pmap_next;
7116 		}
7117 		pmap_simple_unlock(&pmaps_lock);
7118 	}
7119 #endif
7120 }
7121 
7122 /*
7123  *      By default, don't attempt pmap GC more frequently
7124  *      than once per minute.
7125  */
7126 
7127 void
7128 compute_pmap_gc_throttle(
7129 	void *arg __unused)
7130 {
7131 	pmap_gc_allowed_by_time_throttle = TRUE;
7132 }
7133 
7134 /*
7135  * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
7136  *
7137  * Invalidates all of the instruction cache on a physical page and
7138  * pushes any dirty data from the data cache for the same physical page.
7139  */
7140 
7141 kern_return_t
7142 pmap_attribute_cache_sync(
7143 	ppnum_t pp,
7144 	vm_size_t size,
7145 	__unused vm_machine_attribute_t attribute,
7146 	__unused vm_machine_attribute_val_t * value)
7147 {
7148 	if (size > PAGE_SIZE) {
7149 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7150 	} else {
7151 		cache_sync_page(pp);
7152 	}
7153 
7154 	return KERN_SUCCESS;
7155 }
7156 
7157 /*
7158  * pmap_sync_page_data_phys(ppnum_t pp)
7159  *
7160  * Invalidates all of the instruction cache on a physical page and
7161  * pushes any dirty data from the data cache for the same physical page
7162  */
7163 void
7164 pmap_sync_page_data_phys(
7165 	ppnum_t pp)
7166 {
7167 	cache_sync_page(pp);
7168 }
7169 
7170 /*
7171  * pmap_sync_page_attributes_phys(ppnum_t pp)
7172  *
7173  * Write back and invalidate all cachelines on a physical page.
7174  */
7175 void
7176 pmap_sync_page_attributes_phys(
7177 	ppnum_t pp)
7178 {
7179 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7180 }
7181 
7182 #if CONFIG_COREDUMP
7183 /* temporary workaround */
7184 boolean_t
7185 coredumpok(
7186 	vm_map_t map,
7187 	mach_vm_offset_t va)
7188 {
7189 	pt_entry_t     *pte_p;
7190 	pt_entry_t      spte;
7191 
7192 	pte_p = pmap_pte(map->pmap, va);
7193 	if (0 == pte_p) {
7194 		return FALSE;
7195 	}
7196 	spte = *pte_p;
7197 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7198 }
7199 #endif
7200 
7201 void
7202 fillPage(
7203 	ppnum_t pn,
7204 	unsigned int fill)
7205 {
7206 	unsigned int   *addr;
7207 	int             count;
7208 
7209 	addr = (unsigned int *) phystokv(ptoa(pn));
7210 	count = PAGE_SIZE / sizeof(unsigned int);
7211 	while (count--) {
7212 		*addr++ = fill;
7213 	}
7214 }
7215 
7216 extern void     mapping_set_mod(ppnum_t pn);
7217 
7218 void
7219 mapping_set_mod(
7220 	ppnum_t pn)
7221 {
7222 	pmap_set_modify(pn);
7223 }
7224 
7225 extern void     mapping_set_ref(ppnum_t pn);
7226 
7227 void
7228 mapping_set_ref(
7229 	ppnum_t pn)
7230 {
7231 	pmap_set_reference(pn);
7232 }
7233 
7234 /*
7235  * Clear specified attribute bits.
7236  *
7237  * Try to force an arm_fast_fault() for all mappings of
7238  * the page - to force attributes to be set again at fault time.
7239  * If the forcing succeeds, clear the cached bits at the head.
7240  * Otherwise, something must have been wired, so leave the cached
7241  * attributes alone.
7242  */
7243 MARK_AS_PMAP_TEXT static void
7244 phys_attribute_clear_with_flush_range(
7245 	ppnum_t         pn,
7246 	unsigned int    bits,
7247 	int             options,
7248 	void            *arg,
7249 	pmap_tlb_flush_range_t *flush_range)
7250 {
7251 	pmap_paddr_t    pa = ptoa(pn);
7252 	vm_prot_t       allow_mode = VM_PROT_ALL;
7253 
7254 #if XNU_MONITOR
7255 	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7256 		panic("%s: illegal request, "
7257 		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7258 		    __FUNCTION__,
7259 		    pn, bits, options, arg, flush_range);
7260 	}
7261 #endif
7262 	if ((arg != NULL) || (flush_range != NULL)) {
7263 		options = options & ~PMAP_OPTIONS_NOFLUSH;
7264 	}
7265 
7266 	if (__improbable((bits & PP_ATTR_MODIFIED) &&
7267 	    (options & PMAP_OPTIONS_NOFLUSH))) {
7268 		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
7269 		    "should not clear 'modified' without flushing TLBs\n",
7270 		    pn, bits, options, arg, flush_range);
7271 	}
7272 
7273 	assert(pn != vm_page_fictitious_addr);
7274 
7275 	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7276 		assert(bits == PP_ATTR_MODIFIED);
7277 
7278 		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7279 		/*
7280 		 * We short circuit this case; it should not need to
7281 		 * invoke arm_force_fast_fault, so just clear the modified bit.
7282 		 * pmap_page_protect has taken care of resetting
7283 		 * the state so that we'll see the next write as a fault to
7284 		 * the VM (i.e. we don't want a fast fault).
7285 		 */
7286 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7287 		return;
7288 	}
7289 	if (bits & PP_ATTR_REFERENCED) {
7290 		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7291 	}
7292 	if (bits & PP_ATTR_MODIFIED) {
7293 		allow_mode &= ~VM_PROT_WRITE;
7294 	}
7295 
7296 	if (bits == PP_ATTR_NOENCRYPT) {
7297 		/*
7298 		 * We short circuit this case; it should not need to
7299 		 * invoke arm_force_fast_fault, so just clear and
7300 		 * return.  On ARM, this bit is just a debugging aid.
7301 		 */
7302 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7303 		return;
7304 	}
7305 
7306 	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7307 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7308 	}
7309 }
7310 
7311 MARK_AS_PMAP_TEXT void
7312 phys_attribute_clear_internal(
7313 	ppnum_t         pn,
7314 	unsigned int    bits,
7315 	int             options,
7316 	void            *arg)
7317 {
7318 	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7319 }
7320 
7321 #if __ARM_RANGE_TLBI__
7322 MARK_AS_PMAP_TEXT static vm_map_address_t
7323 phys_attribute_clear_twig_internal(
7324 	pmap_t pmap,
7325 	vm_map_address_t start,
7326 	vm_map_address_t end,
7327 	unsigned int bits,
7328 	unsigned int options,
7329 	pmap_tlb_flush_range_t *flush_range)
7330 {
7331 	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7332 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7333 	assert(end >= start);
7334 	assert((end - start) <= pt_attr_twig_size(pt_attr));
7335 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7336 	vm_map_address_t va = start;
7337 	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7338 	tt_entry_t     *tte_p;
7339 	tte_p = pmap_tte(pmap, start);
7340 	unsigned int npages = 0;
7341 
7342 	if (tte_p == (tt_entry_t *) NULL) {
7343 		return end;
7344 	}
7345 
7346 	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
7347 		pte_p = (pt_entry_t *) ttetokv(*tte_p);
7348 
7349 		start_pte_p = &pte_p[pte_index(pt_attr, start)];
7350 		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7351 		assert(end_pte_p >= start_pte_p);
7352 		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7353 			if (__improbable(npages++ && pmap_pending_preemption())) {
7354 				return va;
7355 			}
7356 			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
7357 			if (pa_valid(pa)) {
7358 				ppnum_t pn = (ppnum_t) atop(pa);
7359 				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7360 			}
7361 		}
7362 	}
7363 	return end;
7364 }
7365 
7366 MARK_AS_PMAP_TEXT vm_map_address_t
7367 phys_attribute_clear_range_internal(
7368 	pmap_t pmap,
7369 	vm_map_address_t start,
7370 	vm_map_address_t end,
7371 	unsigned int bits,
7372 	unsigned int options)
7373 {
7374 	if (__improbable(end < start)) {
7375 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7376 	}
7377 	validate_pmap_mutable(pmap);
7378 
7379 	vm_map_address_t va = start;
7380 	pmap_tlb_flush_range_t flush_range = {
7381 		.ptfr_pmap = pmap,
7382 		.ptfr_start = start,
7383 		.ptfr_end = end,
7384 		.ptfr_flush_needed = false
7385 	};
7386 
7387 	pmap_lock(pmap, PMAP_LOCK_SHARED);
7388 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7389 
7390 	while (va < end) {
7391 		vm_map_address_t curr_end;
7392 
7393 		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7394 		if (curr_end > end) {
7395 			curr_end = end;
7396 		}
7397 
7398 		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7399 		if ((va < curr_end) || pmap_pending_preemption()) {
7400 			break;
7401 		}
7402 	}
7403 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
7404 	if (flush_range.ptfr_flush_needed) {
7405 		flush_range.ptfr_end = va;
7406 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7407 			flush_range.ptfr_start,
7408 			flush_range.ptfr_end - flush_range.ptfr_start,
7409 			flush_range.ptfr_pmap,
7410 			true);
7411 		sync_tlb_flush();
7412 	}
7413 	return va;
7414 }
7415 
7416 static void
7417 phys_attribute_clear_range(
7418 	pmap_t pmap,
7419 	vm_map_address_t start,
7420 	vm_map_address_t end,
7421 	unsigned int bits,
7422 	unsigned int options)
7423 {
7424 	/*
7425 	 * We allow single-page requests to execute non-preemptibly,
7426 	 * as it doesn't make sense to sample AST_URGENT for a single-page
7427 	 * operation, and there are a couple of special use cases that
7428 	 * require a non-preemptible single-page operation.
7429 	 */
7430 	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7431 		pmap_verify_preemptible();
7432 	}
7433 
7434 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7435 
7436 	while (start < end) {
7437 #if XNU_MONITOR
7438 		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
7439 #else
7440 		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7441 #endif
7442 	}
7443 
7444 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7445 }
7446 #endif /* __ARM_RANGE_TLBI__ */
7447 
7448 static void
7449 phys_attribute_clear(
7450 	ppnum_t         pn,
7451 	unsigned int    bits,
7452 	int             options,
7453 	void            *arg)
7454 {
7455 	/*
7456 	 * Do we really want this tracepoint?  It will be extremely chatty.
7457 	 * Also, should we have a corresponding trace point for the set path?
7458 	 */
7459 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7460 
7461 #if XNU_MONITOR
7462 	phys_attribute_clear_ppl(pn, bits, options, arg);
7463 #else
7464 	phys_attribute_clear_internal(pn, bits, options, arg);
7465 #endif
7466 
7467 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7468 }
7469 
7470 /*
7471  *	Set specified attribute bits.
7472  *
7473  *	Set cached value in the pv head because we have
7474  *	no per-mapping hardware support for referenced and
7475  *	modify bits.
7476  */
7477 MARK_AS_PMAP_TEXT void
7478 phys_attribute_set_internal(
7479 	ppnum_t pn,
7480 	unsigned int bits)
7481 {
7482 	pmap_paddr_t    pa = ptoa(pn);
7483 	assert(pn != vm_page_fictitious_addr);
7484 
7485 #if XNU_MONITOR
7486 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7487 		panic("%s: illegal request, "
7488 		    "pn=%u, bits=%#x",
7489 		    __FUNCTION__,
7490 		    pn, bits);
7491 	}
7492 #endif
7493 
7494 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7495 
7496 	return;
7497 }
7498 
7499 static void
7500 phys_attribute_set(
7501 	ppnum_t pn,
7502 	unsigned int bits)
7503 {
7504 #if XNU_MONITOR
7505 	phys_attribute_set_ppl(pn, bits);
7506 #else
7507 	phys_attribute_set_internal(pn, bits);
7508 #endif
7509 }
7510 
7511 
7512 /*
7513  *	Check specified attribute bits.
7514  *
7515  *	use the software cached bits (since no hw support).
7516  */
7517 static boolean_t
7518 phys_attribute_test(
7519 	ppnum_t pn,
7520 	unsigned int bits)
7521 {
7522 	pmap_paddr_t    pa = ptoa(pn);
7523 	assert(pn != vm_page_fictitious_addr);
7524 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7525 }
7526 
7527 
7528 /*
7529  *	Set the modify/reference bits on the specified physical page.
7530  */
7531 void
7532 pmap_set_modify(ppnum_t pn)
7533 {
7534 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
7535 }
7536 
7537 
7538 /*
7539  *	Clear the modify bits on the specified physical page.
7540  */
7541 void
7542 pmap_clear_modify(
7543 	ppnum_t pn)
7544 {
7545 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7546 }
7547 
7548 
7549 /*
7550  *	pmap_is_modified:
7551  *
7552  *	Return whether or not the specified physical page is modified
7553  *	by any physical maps.
7554  */
7555 boolean_t
7556 pmap_is_modified(
7557 	ppnum_t pn)
7558 {
7559 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7560 }
7561 
7562 
7563 /*
7564  *	Set the reference bit on the specified physical page.
7565  */
7566 static void
7567 pmap_set_reference(
7568 	ppnum_t pn)
7569 {
7570 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
7571 }
7572 
7573 /*
7574  *	Clear the reference bits on the specified physical page.
7575  */
7576 void
7577 pmap_clear_reference(
7578 	ppnum_t pn)
7579 {
7580 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7581 }
7582 
7583 
7584 /*
7585  *	pmap_is_referenced:
7586  *
7587  *	Return whether or not the specified physical page is referenced
7588  *	by any physical maps.
7589  */
7590 boolean_t
7591 pmap_is_referenced(
7592 	ppnum_t pn)
7593 {
7594 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7595 }
7596 
7597 /*
7598  * pmap_get_refmod(phys)
7599  *  returns the referenced and modified bits of the specified
7600  *  physical page.
7601  */
7602 unsigned int
7603 pmap_get_refmod(
7604 	ppnum_t pn)
7605 {
7606 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7607 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7608 }
7609 
7610 static inline unsigned int
7611 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7612 {
7613 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7614 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7615 }
7616 
7617 /*
7618  * pmap_clear_refmod(phys, mask)
7619  *  clears the referenced and modified bits as specified by the mask
7620  *  of the specified physical page.
7621  */
7622 void
7623 pmap_clear_refmod_options(
7624 	ppnum_t         pn,
7625 	unsigned int    mask,
7626 	unsigned int    options,
7627 	void            *arg)
7628 {
7629 	unsigned int    bits;
7630 
7631 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7632 	phys_attribute_clear(pn, bits, options, arg);
7633 }
7634 
7635 /*
7636  * Perform pmap_clear_refmod_options on a virtual address range.
7637  * The operation is performed in bulk, and TLB flushes are coalesced
7638  * where possible.
7639  *
7640  * Returns true if the operation is supported on this platform.
7641  * If this function returns false, the operation is not supported and
7642  * nothing has been modified in the pmap.
7643  */
7644 bool
7645 pmap_clear_refmod_range_options(
7646 	pmap_t pmap __unused,
7647 	vm_map_address_t start __unused,
7648 	vm_map_address_t end __unused,
7649 	unsigned int mask __unused,
7650 	unsigned int options __unused)
7651 {
7652 #if __ARM_RANGE_TLBI__
7653 	unsigned int    bits;
7654 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7655 	phys_attribute_clear_range(pmap, start, end, bits, options);
7656 	return true;
7657 #else /* __ARM_RANGE_TLBI__ */
7658 #pragma unused(pmap, start, end, mask, options)
7659 	/*
7660 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7661 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7662 	 * contiguous range of addresses, which is a large performance improvement
7663 	 * on platforms that support ranged TLBI instructions. On older platforms
7664 	 * we can only flush per-page or the entire ASID, so we currently support
7665 	 * this operation only on platforms with ranged TLBI instructions; on
7666 	 * other platforms, we require that the VM modify the bits on a per-page
7667 	 * basis.
7668 	return false;
7669 #endif /* __ARM_RANGE_TLBI__ */
7670 }
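
/*
 * Illustrative usage sketch: clearing the modified bit across a range, with a
 * per-page fallback when ranged TLBI is not supported.  'pmap', 'start' and
 * 'end' are hypothetical.
 *
 *	if (!pmap_clear_refmod_range_options(pmap, start, end, VM_MEM_MODIFIED, 0)) {
 *		for (vm_map_address_t va = start; va < end; va += PAGE_SIZE) {
 *			ppnum_t pn = pmap_find_phys(pmap, va);
 *			if (pn != 0) {
 *				pmap_clear_refmod(pn, VM_MEM_MODIFIED);
 *			}
 *		}
 *	}
 */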
7671 
7672 void
7673 pmap_clear_refmod(
7674 	ppnum_t pn,
7675 	unsigned int mask)
7676 {
7677 	pmap_clear_refmod_options(pn, mask, 0, NULL);
7678 }
7679 
7680 unsigned int
7681 pmap_disconnect_options(
7682 	ppnum_t pn,
7683 	unsigned int options,
7684 	void *arg)
7685 {
7686 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7687 		/*
7688 		 * On ARM, the "modified" bit is managed by software, so
7689 		 * we know up-front if the physical page is "modified",
7690 		 * without having to scan all the PTEs pointing to it.
7691 		 * The caller should have made the VM page "busy" so no one
7692 		 * should be able to establish any new mapping and "modify"
7693 		 * the page behind us.
7694 		 */
7695 		if (pmap_is_modified(pn)) {
7696 			/*
7697 			 * The page has been modified and will be sent to
7698 			 * the VM compressor.
7699 			 */
7700 			options |= PMAP_OPTIONS_COMPRESSOR;
7701 		} else {
7702 			/*
7703 			 * The page hasn't been modified and will be freed
7704 			 * instead of compressed.
7705 			 */
7706 		}
7707 	}
7708 
7709 	/* disconnect the page */
7710 	pmap_page_protect_options(pn, 0, options, arg);
7711 
7712 	/* return ref/chg status */
7713 	return pmap_get_refmod(pn);
7714 }
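
/*
 * Illustrative usage sketch: disconnecting a page on its way to the VM
 * compressor, letting the software-managed modified bit decide whether the
 * contents still need to be compressed.  'pn' is hypothetical.
 *
 *	unsigned int refmod = pmap_disconnect_options(pn,
 *	    PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED, NULL);
 *	if (refmod & VM_MEM_MODIFIED) {
 *		// The page was dirty; it will be handed to the compressor.
 *	}
 */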
7715 
7716 /*
7717  *	Routine:
7718  *		pmap_disconnect
7719  *
7720  *	Function:
7721  *		Disconnect all mappings for this page and return reference and change status
7722  *		in generic format.
7723  *
7724  */
7725 unsigned int
7726 pmap_disconnect(
7727 	ppnum_t pn)
7728 {
7729 	pmap_page_protect(pn, 0);       /* disconnect the page */
7730 	return pmap_get_refmod(pn);   /* return ref/chg status */
7731 }
7732 
7733 boolean_t
7734 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7735 {
7736 	if (ptoa(first) >= vm_last_phys) {
7737 		return FALSE;
7738 	}
7739 	if (ptoa(last) < vm_first_phys) {
7740 		return FALSE;
7741 	}
7742 
7743 	return TRUE;
7744 }
7745 
7746 /*
7747  * The state maintained by the noencrypt functions is used as a
7748  * debugging aid on ARM.  This incurs some overhead on the part
7749  * of the caller.  A special case check in phys_attribute_clear
7750  * (the most expensive path) currently minimizes this overhead,
7751  * but stubbing these functions out on RELEASE kernels yields
7752  * further wins.
7753  */
7754 boolean_t
7755 pmap_is_noencrypt(
7756 	ppnum_t pn)
7757 {
7758 #if DEVELOPMENT || DEBUG
7759 	boolean_t result = FALSE;
7760 
7761 	if (!pa_valid(ptoa(pn))) {
7762 		return FALSE;
7763 	}
7764 
7765 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7766 
7767 	return result;
7768 #else
7769 #pragma unused(pn)
7770 	return FALSE;
7771 #endif
7772 }
7773 
7774 void
7775 pmap_set_noencrypt(
7776 	ppnum_t pn)
7777 {
7778 #if DEVELOPMENT || DEBUG
7779 	if (!pa_valid(ptoa(pn))) {
7780 		return;
7781 	}
7782 
7783 	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7784 #else
7785 #pragma unused(pn)
7786 #endif
7787 }
7788 
7789 void
7790 pmap_clear_noencrypt(
7791 	ppnum_t pn)
7792 {
7793 #if DEVELOPMENT || DEBUG
7794 	if (!pa_valid(ptoa(pn))) {
7795 		return;
7796 	}
7797 
7798 	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7799 #else
7800 #pragma unused(pn)
7801 #endif
7802 }
7803 
7804 #if XNU_MONITOR
7805 boolean_t
7806 pmap_is_monitor(ppnum_t pn)
7807 {
7808 	assert(pa_valid(ptoa(pn)));
7809 	return phys_attribute_test(pn, PP_ATTR_MONITOR);
7810 }
7811 #endif
7812 
7813 void
7814 pmap_lock_phys_page(ppnum_t pn)
7815 {
7816 #if !XNU_MONITOR
7817 	unsigned int    pai;
7818 	pmap_paddr_t    phys = ptoa(pn);
7819 
7820 	if (pa_valid(phys)) {
7821 		pai = pa_index(phys);
7822 		pvh_lock(pai);
7823 	} else
7824 #else
7825 	(void)pn;
7826 #endif
7827 	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
7828 }
7829 
7830 
7831 void
7832 pmap_unlock_phys_page(ppnum_t pn)
7833 {
7834 #if !XNU_MONITOR
7835 	unsigned int    pai;
7836 	pmap_paddr_t    phys = ptoa(pn);
7837 
7838 	if (pa_valid(phys)) {
7839 		pai = pa_index(phys);
7840 		pvh_unlock(pai);
7841 	} else
7842 #else
7843 	(void)pn;
7844 #endif
7845 	{ simple_unlock(&phys_backup_lock);}
7846 }
7847 
7848 MARK_AS_PMAP_TEXT static void
7849 pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
7850 {
7851 #if     (__ARM_VMSA__ == 7)
7852 	cpu_data_ptr->cpu_user_pmap = pmap;
7853 	cpu_data_ptr->cpu_user_pmap_stamp = pmap->stamp;
7854 	if (pmap != kernel_pmap) {
7855 		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7856 	}
7857 
7858 #if     MACH_ASSERT && __ARM_USER_PROTECT__
7859 	{
7860 		unsigned int ttbr0_val, ttbr1_val;
7861 		__asm__ volatile ("mrc p15,0,%0,c2,c0,0\n" : "=r"(ttbr0_val));
7862 		__asm__ volatile ("mrc p15,0,%0,c2,c0,1\n" : "=r"(ttbr1_val));
7863 		if (ttbr0_val != ttbr1_val) {
7864 			panic("Misaligned ttbr0  %08X", ttbr0_val);
7865 		}
7866 		if (pmap->ttep & 0x1000) {
7867 			panic("Misaligned ttbr0  %08X", pmap->ttep);
7868 		}
7869 	}
7870 #endif
7871 #if !__ARM_USER_PROTECT__
7872 	set_mmu_ttb(pmap->ttep);
7873 	set_context_id(pmap->hw_asid);
7874 #endif
7875 
7876 #else /* (__ARM_VMSA__ == 7) */
7877 
7878 	if (pmap != kernel_pmap) {
7879 		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7880 		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
7881 		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
7882 		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
7883 		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
7884 #if __ARM_MIXED_PAGE_SIZE__
7885 		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
7886 #endif
7887 	}
7888 
7889 
7890 #if __ARM_MIXED_PAGE_SIZE__
7891 	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
7892 		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
7893 	}
7894 #endif /* __ARM_MIXED_PAGE_SIZE__ */
7895 
7896 
7897 	if (pmap != kernel_pmap) {
7898 		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
7899 	} else if (!pmap_user_ttb_is_clear()) {
7900 		pmap_clear_user_ttb_internal();
7901 	}
7902 #endif /* (__ARM_VMSA__ == 7) */
7903 }
7904 
7905 MARK_AS_PMAP_TEXT void
7906 pmap_clear_user_ttb_internal(void)
7907 {
7908 #if (__ARM_VMSA__ > 7)
7909 	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7910 #else
7911 	set_mmu_ttb(kernel_pmap->ttep);
7912 #endif
7913 }
7914 
7915 void
7916 pmap_clear_user_ttb(void)
7917 {
7918 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7919 #if XNU_MONITOR
7920 	pmap_clear_user_ttb_ppl();
7921 #else
7922 	pmap_clear_user_ttb_internal();
7923 #endif
7924 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7925 }
7926 
7927 
7928 #if defined(__arm64__)
7929 /*
7930  * Marker for use in multi-pass fast-fault PV list processing.
7931  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7932  * these functions, as compressed PTEs should never be present in PV lists.
7933  * Note that this only holds true for arm64; for arm32 we don't have enough
7934  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7935  * and WRITEABLE marker depending on whether the PTE is valid.
7936  */
7937 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7938 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7939 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7940 #endif
7941 
7942 
7943 MARK_AS_PMAP_TEXT static boolean_t
7944 arm_force_fast_fault_with_flush_range(
7945 	ppnum_t         ppnum,
7946 	vm_prot_t       allow_mode,
7947 	int             options,
7948 	pmap_tlb_flush_range_t *flush_range)
7949 {
7950 	pmap_paddr_t     phys = ptoa(ppnum);
7951 	pv_entry_t      *pve_p;
7952 	pt_entry_t      *pte_p;
7953 	unsigned int     pai;
7954 	unsigned int     pass1_updated = 0;
7955 	unsigned int     pass2_updated = 0;
7956 	boolean_t        result;
7957 	pv_entry_t     **pv_h;
7958 	bool             is_reusable;
7959 	bool             ref_fault;
7960 	bool             mod_fault;
7961 	bool             clear_write_fault = false;
7962 	bool             ref_aliases_mod = false;
7963 	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
7964 
7965 	assert(ppnum != vm_page_fictitious_addr);
7966 
7967 	if (!pa_valid(phys)) {
7968 		return FALSE;   /* Not a managed page. */
7969 	}
7970 
7971 	result = TRUE;
7972 	ref_fault = false;
7973 	mod_fault = false;
7974 	pai = pa_index(phys);
7975 	if (__probable(mustsynch)) {
7976 		pvh_lock(pai);
7977 	}
7978 	pv_h = pai_to_pvh(pai);
7979 
7980 #if XNU_MONITOR
7981 	if (__improbable(ppattr_pa_test_monitor(phys))) {
7982 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
7983 	}
7984 #endif
7985 	pte_p = PT_ENTRY_NULL;
7986 	pve_p = PV_ENTRY_NULL;
7987 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
7988 		pte_p = pvh_ptep(pv_h);
7989 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
7990 		pve_p = pvh_pve_list(pv_h);
7991 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
7992 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
7993 	}
7994 
7995 	is_reusable = ppattr_test_reusable(pai);
7996 
7997 	/*
7998 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
7999 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
8000 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
8001 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
8002 	 * operation, TLB invalidation may be handled by the caller so it's possible for
8003 	 * tlb_flush_needed to be true while issue_tlbi is false.
8004 	 */
8005 	bool issue_tlbi = false;
8006 	bool tlb_flush_needed = false;
8007 
8008 	pv_entry_t *orig_pve_p = pve_p;
8009 	pt_entry_t *orig_pte_p = pte_p;
8010 	int pve_ptep_idx = 0;
8011 
8012 	/*
8013 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8014 	 * TLB invalidation in pass 2.
8015 	 */
8016 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8017 		pt_entry_t       spte;
8018 		pt_entry_t       tmplate;
8019 
8020 		if (pve_p != PV_ENTRY_NULL) {
8021 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8022 			if (pte_p == PT_ENTRY_NULL) {
8023 				goto fff_skip_pve_pass1;
8024 			}
8025 		}
8026 
8027 #ifdef PVH_FLAG_IOMMU
8028 		if (pvh_ptep_is_iommu(pte_p)) {
8029 			goto fff_skip_pve_pass1;
8030 		}
8031 #endif
8032 		if (*pte_p == ARM_PTE_EMPTY) {
8033 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8034 		}
8035 		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
8036 			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8037 		}
8038 
8039 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8040 		const pmap_t pmap = ptdp->pmap;
8041 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8042 		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8043 
8044 		assert(va >= pmap->min && va < pmap->max);
8045 
8046 		/* update pmap stats and ledgers */
8047 		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8048 		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8049 		if (is_altacct) {
8050 			/*
8051 			 * We do not track "reusable" status for
8052 			 * "alternate accounting" mappings.
8053 			 */
8054 		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8055 		    is_reusable &&
8056 		    is_internal &&
8057 		    pmap != kernel_pmap) {
8058 			/* one less "reusable" */
8059 			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8060 			/* one more "internal" */
8061 			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8062 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8063 
8064 			/*
8065 			 * Since the page is being marked non-reusable, we assume that it will be
8066 			 * modified soon.  Avoid the cost of another trap to handle the fast
8067 			 * fault when we next write to this page.
8068 			 */
8069 			clear_write_fault = true;
8070 		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8071 		    !is_reusable &&
8072 		    is_internal &&
8073 		    pmap != kernel_pmap) {
8074 			/* one more "reusable" */
8075 			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8076 			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8077 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8078 		}
8079 
8080 		bool wiredskip = pte_is_wired(*pte_p) &&
8081 		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8082 
8083 		if (wiredskip) {
8084 			result = FALSE;
8085 			goto fff_skip_pve_pass1;
8086 		}
8087 
8088 		spte = *pte_p;
8089 		tmplate = spte;
8090 
8091 		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8092 			/* read protection sets the pte to fault */
8093 			tmplate =  tmplate & ~ARM_PTE_AF;
8094 			ref_fault = true;
8095 		}
8096 		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8097 			/* take away write permission if set */
8098 			if (pmap == kernel_pmap) {
8099 				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8100 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8101 					pte_set_was_writeable(tmplate, true);
8102 					mod_fault = true;
8103 				}
8104 			} else {
8105 				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8106 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8107 					pte_set_was_writeable(tmplate, true);
8108 					mod_fault = true;
8109 				}
8110 			}
8111 		}
8112 
8113 #if MACH_ASSERT && XNU_MONITOR
8114 		if (is_pte_xprr_protected(pmap, spte)) {
8115 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8116 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8117 				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
8118 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8119 				    ppnum, options, allow_mode);
8120 			}
8121 		}
8122 #endif /* MACH_ASSERT && XNU_MONITOR */
8123 
8124 		if (result && (tmplate != spte)) {
8125 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
8126 			    !(options & PMAP_OPTIONS_NOFLUSH)) {
8127 				tlb_flush_needed = true;
8128 				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8129 				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
8130 #ifdef ARM_PTE_FF_MARKER
8131 					assert(!(spte & ARM_PTE_FF_MARKER));
8132 					tmplate |= ARM_PTE_FF_MARKER;
8133 					++pass1_updated;
8134 #endif
8135 					issue_tlbi = true;
8136 				}
8137 			}
8138 			write_pte_fast(pte_p, tmplate);
8139 		}
8140 
8141 fff_skip_pve_pass1:
8142 		pte_p = PT_ENTRY_NULL;
8143 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8144 			pve_ptep_idx = 0;
8145 			pve_p = pve_next(pve_p);
8146 		}
8147 	}
8148 
8149 	if (tlb_flush_needed) {
8150 		FLUSH_PTE_STRONG();
8151 	}
8152 
8153 	if (!issue_tlbi) {
8154 		goto fff_finish;
8155 	}
8156 
8157 	/* Pass 2: Issue any required TLB invalidations */
8158 	pve_p = orig_pve_p;
8159 	pte_p = orig_pte_p;
8160 	pve_ptep_idx = 0;
8161 
8162 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8163 		if (pve_p != PV_ENTRY_NULL) {
8164 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8165 			if (pte_p == PT_ENTRY_NULL) {
8166 				goto fff_skip_pve_pass2;
8167 			}
8168 		}
8169 
8170 #ifdef PVH_FLAG_IOMMU
8171 		if (pvh_ptep_is_iommu(pte_p)) {
8172 			goto fff_skip_pve_pass2;
8173 		}
8174 #endif
8175 
8176 #ifdef ARM_PTE_FF_MARKER
8177 		pt_entry_t spte = *pte_p;
8178 
8179 		if (!(spte & ARM_PTE_FF_MARKER)) {
8180 			goto fff_skip_pve_pass2;
8181 		} else {
8182 			spte &= (~ARM_PTE_FF_MARKER);
8183 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8184 			write_pte_fast(pte_p, spte);
8185 			++pass2_updated;
8186 		}
8187 #endif
8188 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8189 		const pmap_t pmap = ptdp->pmap;
8190 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8191 
8192 		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8193 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8194 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
8195 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8196 		}
8197 
8198 fff_skip_pve_pass2:
8199 		pte_p = PT_ENTRY_NULL;
8200 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8201 			pve_ptep_idx = 0;
8202 			pve_p = pve_next(pve_p);
8203 		}
8204 	}
8205 
8206 fff_finish:
8207 	if (__improbable(pass1_updated != pass2_updated)) {
8208 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8209 		    __func__, pass1_updated, pass2_updated);
8210 	}
8211 
8212 	/*
8213 	 * If we are using the same approach for ref and mod
8214 	 * faults on this PTE, do not clear the write fault;
8215 	 * this would cause both ref and mod to be set on the
8216 	 * page again, and prevent us from taking ANY read/write
8217 	 * fault on the mapping.
8218 	 */
8219 	if (clear_write_fault && !ref_aliases_mod) {
8220 		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
8221 	}
8222 	if (tlb_flush_needed) {
8223 		if (flush_range) {
8224 			/* Delayed flush. Signal to the caller that the flush is needed. */
8225 			flush_range->ptfr_flush_needed = true;
8226 		} else {
8227 			sync_tlb_flush();
8228 		}
8229 	}
8230 
8231 	/* update global "reusable" status for this page */
8232 	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8233 		ppattr_clear_reusable(pai);
8234 	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8235 		ppattr_set_reusable(pai);
8236 	}
8237 
8238 	if (mod_fault) {
8239 		ppattr_set_modfault(pai);
8240 	}
8241 	if (ref_fault) {
8242 		ppattr_set_reffault(pai);
8243 	}
8244 	if (__probable(mustsynch)) {
8245 		pvh_unlock(pai);
8246 	}
8247 	return result;
8248 }
8249 
8250 MARK_AS_PMAP_TEXT boolean_t
8251 arm_force_fast_fault_internal(
8252 	ppnum_t         ppnum,
8253 	vm_prot_t       allow_mode,
8254 	int             options)
8255 {
8256 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8257 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8258 	}
8259 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8260 }
8261 
8262 /*
8263  *	Routine:	arm_force_fast_fault
8264  *
8265  *	Function:
8266  *		Force all mappings for this page to fault according
8267  *		to the access modes allowed, so we can gather ref/modify
8268  *		bits again.
8269  */
8270 
8271 boolean_t
8272 arm_force_fast_fault(
8273 	ppnum_t         ppnum,
8274 	vm_prot_t       allow_mode,
8275 	int             options,
8276 	__unused void   *arg)
8277 {
8278 	pmap_paddr_t    phys = ptoa(ppnum);
8279 
8280 	assert(ppnum != vm_page_fictitious_addr);
8281 
8282 	if (!pa_valid(phys)) {
8283 		return FALSE;   /* Not a managed page. */
8284 	}
8285 
8286 #if XNU_MONITOR
8287 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8288 #else
8289 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8290 #endif
8291 }
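
/*
 * Illustrative sketch (editorial addition, not compiled): a hypothetical
 * caller that wants to regather ref/mod state for a managed page could
 * re-arm the fast-fault machinery as shown below, leaving reads permitted
 * but forcing a fault on the next write.  'pn' is a hypothetical page number.
 */
#if 0
	ppnum_t pn = /* hypothetical managed page */ 0;
	if (arm_force_fast_fault(pn, VM_PROT_READ, 0, NULL)) {
		/* At least one mapping was downgraded and its TLB entries flushed. */
	}
#endif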
8292 
8293 /*
8294  *	Routine:	arm_clear_fast_fault
8295  *
8296  *	Function:
8297  *		Clear pending force fault for all mappings for this page based on
8298  *		the observed fault type, update ref/modify bits.
8299  */
8300 MARK_AS_PMAP_TEXT static boolean_t
8301 arm_clear_fast_fault(
8302 	ppnum_t ppnum,
8303 	vm_prot_t fault_type,
8304 	pt_entry_t *pte_p)
8305 {
8306 	pmap_paddr_t    pa = ptoa(ppnum);
8307 	pv_entry_t     *pve_p;
8308 	unsigned int    pai;
8309 	boolean_t       result;
8310 	bool            tlb_flush_needed = false;
8311 	pv_entry_t    **pv_h;
8312 	unsigned int    npve = 0;
8313 	unsigned int    pass1_updated = 0;
8314 	unsigned int    pass2_updated = 0;
8315 
8316 	assert(ppnum != vm_page_fictitious_addr);
8317 
8318 	if (!pa_valid(pa)) {
8319 		return FALSE;   /* Not a managed page. */
8320 	}
8321 
8322 	result = FALSE;
8323 	pai = pa_index(pa);
8324 	pvh_assert_locked(pai);
8325 	pv_h = pai_to_pvh(pai);
8326 
8327 	pve_p = PV_ENTRY_NULL;
8328 	if (pte_p == PT_ENTRY_NULL) {
8329 		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8330 			pte_p = pvh_ptep(pv_h);
8331 		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8332 			pve_p = pvh_pve_list(pv_h);
8333 		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8334 			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
8335 		}
8336 	}
8337 
8338 	pv_entry_t *orig_pve_p = pve_p;
8339 	pt_entry_t *orig_pte_p = pte_p;
8340 	int pve_ptep_idx = 0;
8341 
8342 	/*
8343 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8344 	 * TLB invalidation in pass 2.
8345 	 */
8346 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8347 		pt_entry_t spte;
8348 		pt_entry_t tmplate;
8349 
8350 		if (pve_p != PV_ENTRY_NULL) {
8351 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8352 			if (pte_p == PT_ENTRY_NULL) {
8353 				goto cff_skip_pve_pass1;
8354 			}
8355 		}
8356 
8357 #ifdef PVH_FLAG_IOMMU
8358 		if (pvh_ptep_is_iommu(pte_p)) {
8359 			goto cff_skip_pve_pass1;
8360 		}
8361 #endif
8362 		if (*pte_p == ARM_PTE_EMPTY) {
8363 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8364 		}
8365 
8366 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8367 		const pmap_t pmap = ptdp->pmap;
8368 		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8369 
8370 		assert(va >= pmap->min && va < pmap->max);
8371 
8372 		spte = *pte_p;
8373 		tmplate = spte;
8374 
8375 		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8376 			{
8377 				if (pmap == kernel_pmap) {
8378 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8379 				} else {
8380 					assert(pmap->type != PMAP_TYPE_NESTED);
8381 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8382 				}
8383 			}
8384 
8385 			tmplate |= ARM_PTE_AF;
8386 
8387 			pte_set_was_writeable(tmplate, false);
8388 			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8389 		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8390 			tmplate = spte | ARM_PTE_AF;
8391 
8392 			{
8393 				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
8394 			}
8395 		}
8396 
8397 #if MACH_ASSERT && XNU_MONITOR
8398 		if (is_pte_xprr_protected(pmap, spte)) {
8399 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8400 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8401 				    "ppnum=0x%x, fault_type=0x%x",
8402 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8403 				    ppnum, fault_type);
8404 			}
8405 		}
8406 #endif /* MACH_ASSERT && XNU_MONITOR */
8407 
8408 		assert(spte != ARM_PTE_TYPE_FAULT);
8409 		if (spte != tmplate) {
8410 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
8411 #ifdef ARM_PTE_FF_MARKER
8412 				assert(!(spte & ARM_PTE_FF_MARKER));
8413 				tmplate |= ARM_PTE_FF_MARKER;
8414 				++pass1_updated;
8415 #endif
8416 				tlb_flush_needed = true;
8417 			}
8418 			write_pte_fast(pte_p, tmplate);
8419 			result = TRUE;
8420 		}
8421 
8422 cff_skip_pve_pass1:
8423 		pte_p = PT_ENTRY_NULL;
8424 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8425 			pve_ptep_idx = 0;
8426 			pve_p = pve_next(pve_p);
8427 			++npve;
8428 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8429 				break;
8430 			}
8431 		}
8432 	}
8433 
8434 	if (!tlb_flush_needed) {
8435 		goto cff_finish;
8436 	}
8437 
8438 	FLUSH_PTE_STRONG();
8439 
8440 	/* Pass 2: Issue any required TLB invalidations */
8441 	pve_p = orig_pve_p;
8442 	pte_p = orig_pte_p;
8443 	pve_ptep_idx = 0;
8444 	npve = 0;
8445 
8446 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8447 		if (pve_p != PV_ENTRY_NULL) {
8448 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8449 			if (pte_p == PT_ENTRY_NULL) {
8450 				goto cff_skip_pve_pass2;
8451 			}
8452 		}
8453 
8454 #ifdef PVH_FLAG_IOMMU
8455 		if (pvh_ptep_is_iommu(pte_p)) {
8456 			goto cff_skip_pve_pass2;
8457 		}
8458 #endif
8459 
8460 #ifdef ARM_PTE_FF_MARKER
8461 		pt_entry_t spte = *pte_p;
8462 
8463 		if (!(spte & ARM_PTE_FF_MARKER)) {
8464 			goto cff_skip_pve_pass2;
8465 		} else {
8466 			spte &= (~ARM_PTE_FF_MARKER);
8467 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8468 			write_pte_fast(pte_p, spte);
8469 			++pass2_updated;
8470 		}
8471 #endif
8472 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8473 		const pmap_t pmap = ptdp->pmap;
8474 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8475 
8476 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8477 
8478 cff_skip_pve_pass2:
8479 		pte_p = PT_ENTRY_NULL;
8480 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8481 			pve_ptep_idx = 0;
8482 			pve_p = pve_next(pve_p);
8483 			++npve;
8484 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8485 				break;
8486 			}
8487 		}
8488 	}
8489 
8490 cff_finish:
8491 	if (__improbable(pass1_updated != pass2_updated)) {
8492 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8493 		    __func__, pass1_updated, pass2_updated);
8494 	}
8495 	if (tlb_flush_needed) {
8496 		sync_tlb_flush();
8497 	}
8498 	return result;
8499 }
8500 
8501 /*
8502  * Determine if the fault was induced by software tracking of
8503  * modify/reference bits.  If so, re-enable the mapping (and set
8504  * the appropriate bits).
8505  *
8506  * Returns KERN_SUCCESS if the fault was induced and was
8507  * successfully handled.
8508  *
8509  * Returns KERN_FAILURE if the fault was not induced and
8510  * the function was unable to deal with it.
8511  *
8512  * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8513  * disallows this type of access.
8514  */
8515 MARK_AS_PMAP_TEXT kern_return_t
8516 arm_fast_fault_internal(
8517 	pmap_t pmap,
8518 	vm_map_address_t va,
8519 	vm_prot_t fault_type,
8520 	__unused bool was_af_fault,
8521 	__unused bool from_user)
8522 {
8523 	kern_return_t   result = KERN_FAILURE;
8524 	pt_entry_t     *ptep;
8525 	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
8526 	unsigned int    pai;
8527 	pmap_paddr_t    pa;
8528 	validate_pmap_mutable(pmap);
8529 
8530 	pmap_lock(pmap, PMAP_LOCK_SHARED);
8531 
8532 	/*
8533 	 * If the entry doesn't exist, is completely invalid, or is already
8534 	 * valid, we can't fix it here.
8535 	 */
8536 
8537 	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8538 	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8539 	if (ptep != PT_ENTRY_NULL) {
8540 		while (true) {
8541 			spte = *((volatile pt_entry_t*)ptep);
8542 
8543 			pa = pte_to_pa(spte);
8544 
8545 			if ((spte == ARM_PTE_TYPE_FAULT) ||
8546 			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
8547 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8548 				return result;
8549 			}
8550 
8551 			if (!pa_valid(pa)) {
8552 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8553 #if XNU_MONITOR
8554 				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
8555 					return KERN_PROTECTION_FAILURE;
8556 				} else
8557 #endif
8558 				return result;
8559 			}
8560 			pai = pa_index(pa);
8561 			pvh_lock(pai);
8562 			if (*ptep == spte) {
8563 				/*
8564 				 * Double-check the spte value, as we care about the AF bit.
8565 				 * It's also possible that pmap_page_protect() transitioned the
8566 				 * PTE to compressed/empty before we grabbed the PVH lock.
8567 				 */
8568 				break;
8569 			}
8570 			pvh_unlock(pai);
8571 		}
8572 	} else {
8573 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
8574 		return result;
8575 	}
8576 
8577 
8578 	if ((result != KERN_SUCCESS) &&
8579 	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
8580 		/*
8581 		 * An attempted access will always clear ref/mod fault state, as
8582 		 * appropriate for the fault type.  arm_clear_fast_fault will
8583 		 * update the associated PTEs for the page as appropriate; if
8584 		 * any PTEs are updated, we redrive the access.  If the mapping
8585 		 * does not actually allow for the attempted access, the
8586 		 * following fault will (hopefully) fail to update any PTEs, and
8587 		 * thus cause arm_fast_fault to decide that it failed to handle
8588 		 * the fault.
8589 		 */
8590 		if (ppattr_test_reffault(pai)) {
8591 			ppattr_clear_reffault(pai);
8592 		}
8593 		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
8594 			ppattr_clear_modfault(pai);
8595 		}
8596 
8597 		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
8598 			/*
8599 			 * Should this preserve KERN_PROTECTION_FAILURE?  The
8600 			 * cost of not doing so is another fault in a case
8601 			 * that should already result in an exception.
8602 			 */
8603 			result = KERN_SUCCESS;
8604 		}
8605 	}
8606 
8607 	/*
8608 	 * If the PTE already has sufficient permissions, we can report the fault as handled.
8609 	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8610 	 * on mappings of the same page.
8611 	 */
8612 	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8613 		uintptr_t ap_ro, ap_rw, ap_x;
8614 		if (pmap == kernel_pmap) {
8615 			ap_ro = ARM_PTE_AP(AP_RONA);
8616 			ap_rw = ARM_PTE_AP(AP_RWNA);
8617 			ap_x = ARM_PTE_NX;
8618 		} else {
8619 			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8620 			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8621 			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8622 		}
8623 		/*
8624 		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8625 		 * hardware they may be xPRR-protected, in which case they'll be handled
8626 		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
8627 		 * handling path currently does not call arm_fast_fault() without at least
8628 		 * VM_PROT_READ in fault_type.
8629 		 */
8630 		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8631 		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8632 			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8633 				result = KERN_SUCCESS;
8634 			}
8635 		}
8636 	}
8637 
8638 	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
8639 		/*
8640 		 * A prior arm_clear_fast_fault() operation may have returned early due to
8641 		 * another pending PV list operation or an excessively large PV list.
8642 		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8643 		 * taking a fault on the same mapping.
8644 		 */
8645 		result = KERN_SUCCESS;
8646 	}
8647 
8648 	pvh_unlock(pai);
8649 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
8650 	return result;
8651 }
8652 
8653 kern_return_t
8654 arm_fast_fault(
8655 	pmap_t pmap,
8656 	vm_map_address_t va,
8657 	vm_prot_t fault_type,
8658 	bool was_af_fault,
8659 	__unused bool from_user)
8660 {
8661 	kern_return_t   result = KERN_FAILURE;
8662 
8663 	if (va < pmap->min || va >= pmap->max) {
8664 		return result;
8665 	}
8666 
8667 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8668 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8669 	    from_user);
8670 
8671 #if     (__ARM_VMSA__ == 7)
8672 	if (pmap != kernel_pmap) {
8673 		pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
8674 		pmap_t          cur_pmap;
8675 		pmap_t          cur_user_pmap;
8676 
8677 		cur_pmap = current_pmap();
8678 		cur_user_pmap = cpu_data_ptr->cpu_user_pmap;
8679 
8680 		if ((cur_user_pmap == cur_pmap) && (cur_pmap == pmap)) {
8681 			if (cpu_data_ptr->cpu_user_pmap_stamp != pmap->stamp) {
8682 				pmap_set_pmap(pmap, current_thread());
8683 				result = KERN_SUCCESS;
8684 				goto done;
8685 			}
8686 		}
8687 	}
8688 #endif
8689 
8690 #if XNU_MONITOR
8691 	result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8692 #else
8693 	result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8694 #endif
8695 
8696 #if (__ARM_VMSA__ == 7)
8697 done:
8698 #endif
8699 
8700 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8701 
8702 	return result;
8703 }
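
/*
 * Illustrative sketch (editorial addition, not compiled): the data-abort
 * path is the expected caller of arm_fast_fault().  A hypothetical call
 * site would try the pmap-level fixup first and fall back to the VM fault
 * path only when the fault was not induced by ref/mod tracking.
 * 'fault_addr' and 'fault_type' are hypothetical locals.
 */
#if 0
	vm_map_address_t fault_addr = /* faulting VA from the exception frame */ 0;
	vm_prot_t fault_type = VM_PROT_READ | VM_PROT_WRITE;

	if (arm_fast_fault(current_pmap(), fault_addr, fault_type,
	    true /* was_af_fault */, true /* from_user */) == KERN_SUCCESS) {
		/* PTE fixed up; redrive the faulting instruction */
	} else {
		/* not an induced fault; hand it to the VM layer */
	}
#endif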
8704 
8705 void
8706 pmap_copy_page(
8707 	ppnum_t psrc,
8708 	ppnum_t pdst)
8709 {
8710 	bcopy_phys((addr64_t) (ptoa(psrc)),
8711 	    (addr64_t) (ptoa(pdst)),
8712 	    PAGE_SIZE);
8713 }
8714 
8715 
8716 /*
8717  *	pmap_copy_part_page copies part of the specified (machine independent) page.
8718  */
8719 void
8720 pmap_copy_part_page(
8721 	ppnum_t psrc,
8722 	vm_offset_t src_offset,
8723 	ppnum_t pdst,
8724 	vm_offset_t dst_offset,
8725 	vm_size_t len)
8726 {
8727 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8728 	    (addr64_t) (ptoa(pdst) + dst_offset),
8729 	    len);
8730 }
8731 
8732 
8733 /*
8734  *	pmap_zero_page zeros the specified (machine independent) page.
8735  */
8736 void
8737 pmap_zero_page(
8738 	ppnum_t pn)
8739 {
8740 	assert(pn != vm_page_fictitious_addr);
8741 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8742 }
8743 
8744 /*
8745  *	pmap_zero_part_page
8746  *	zeros the specified (machine independent) part of a page.
8747  */
8748 void
8749 pmap_zero_part_page(
8750 	ppnum_t pn,
8751 	vm_offset_t offset,
8752 	vm_size_t len)
8753 {
8754 	assert(pn != vm_page_fictitious_addr);
8755 	assert(offset + len <= PAGE_SIZE);
8756 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8757 }
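
/*
 * Illustrative sketch (editorial addition, not compiled): the partial-page
 * helpers above operate on physical pages by page number plus byte offset.
 * A hypothetical caller splitting a page at offset 0x200 might do:
 */
#if 0
	ppnum_t psrc = /* hypothetical source page */ 0;
	ppnum_t pdst = /* hypothetical destination page */ 0;

	pmap_copy_part_page(psrc, 0, pdst, 0, 0x200);        /* copy the first 0x200 bytes */
	pmap_zero_part_page(pdst, 0x200, PAGE_SIZE - 0x200); /* zero the rest of the destination page */
#endif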
8758 
8759 void
8760 pmap_map_globals(
8761 	void)
8762 {
8763 	pt_entry_t      *ptep, pte;
8764 
8765 	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
8766 	assert(ptep != PT_ENTRY_NULL);
8767 	assert(*ptep == ARM_PTE_EMPTY);
8768 
8769 	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
8770 #if __ARM_KERNEL_PROTECT__
8771 	pte |= ARM_PTE_NG;
8772 #endif /* __ARM_KERNEL_PROTECT__ */
8773 	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8774 #if     (__ARM_VMSA__ > 7)
8775 	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8776 #else
8777 	pte |= ARM_PTE_SH;
8778 #endif
8779 	*ptep = pte;
8780 	FLUSH_PTE();
8781 	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);
8782 
8783 #if KASAN
8784 	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8785 #endif
8786 }
8787 
8788 vm_offset_t
8789 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8790 {
8791 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8792 		panic("%s: invalid index %u", __func__, index);
8793 	}
8794 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8795 }
8796 
8797 MARK_AS_PMAP_TEXT unsigned int
8798 pmap_map_cpu_windows_copy_internal(
8799 	ppnum_t pn,
8800 	vm_prot_t prot,
8801 	unsigned int wimg_bits)
8802 {
8803 	pt_entry_t      *ptep = NULL, pte;
8804 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8805 	unsigned int    cpu_num;
8806 	unsigned int    i;
8807 	vm_offset_t     cpu_copywindow_vaddr = 0;
8808 	bool            need_strong_sync = false;
8809 
8810 #if XNU_MONITOR
8811 	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
8812 	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
8813 #endif
8814 
8815 #if XNU_MONITOR
8816 #ifdef  __ARM_COHERENT_IO__
8817 	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
8818 		panic("%s: attempted to map a managed page, "
8819 		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
8820 		    __FUNCTION__,
8821 		    pn, prot, wimg_bits);
8822 	}
8823 	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
8824 		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
8825 	}
8826 
8827 #else /* __ARM_COHERENT_IO__ */
8828 #error CPU copy windows are not properly supported with both the PPL and incoherent IO
8829 #endif /* __ARM_COHERENT_IO__ */
8830 #endif /* XNU_MONITOR */
8831 	cpu_num = pmap_cpu_data->cpu_number;
8832 
8833 	for (i = 0; i < CPUWINDOWS_MAX; i++) {
8834 		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
8835 		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8836 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
8837 		if (*ptep == ARM_PTE_TYPE_FAULT) {
8838 			break;
8839 		}
8840 	}
8841 	if (i == CPUWINDOWS_MAX) {
8842 		panic("pmap_map_cpu_windows_copy: out of window");
8843 	}
8844 
8845 	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8846 #if __ARM_KERNEL_PROTECT__
8847 	pte |= ARM_PTE_NG;
8848 #endif /* __ARM_KERNEL_PROTECT__ */
8849 
8850 	pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8851 
8852 	if (prot & VM_PROT_WRITE) {
8853 		pte |= ARM_PTE_AP(AP_RWNA);
8854 	} else {
8855 		pte |= ARM_PTE_AP(AP_RONA);
8856 	}
8857 
8858 	write_pte_fast(ptep, pte);
8859 	/*
8860 	 * Invalidate the TLB. Nested cpu_copywindow_vaddr usage by an interrupted context is
8861 	 * covered in pmap_unmap_cpu_windows_copy(), after the PTE is cleared and before the TLB invalidate.
8862 	 */
8863 	FLUSH_PTE_STRONG();
8864 	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
8865 	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;
8866 
8867 	return i;
8868 }
8869 
8870 unsigned int
8871 pmap_map_cpu_windows_copy(
8872 	ppnum_t pn,
8873 	vm_prot_t prot,
8874 	unsigned int wimg_bits)
8875 {
8876 #if XNU_MONITOR
8877 	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8878 #else
8879 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8880 #endif
8881 }
8882 
8883 MARK_AS_PMAP_TEXT void
8884 pmap_unmap_cpu_windows_copy_internal(
8885 	unsigned int index)
8886 {
8887 	pt_entry_t      *ptep;
8888 	unsigned int    cpu_num;
8889 	vm_offset_t     cpu_copywindow_vaddr = 0;
8890 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8891 
8892 	cpu_num = pmap_cpu_data->cpu_number;
8893 
8894 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8895 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
8896 	 * (which are likely to have been on I/O memory) are complete before
8897 	 * tearing down the mapping. */
8898 	__builtin_arm_dsb(DSB_SY);
8899 	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8900 	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8901 	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8902 }
8903 
8904 void
8905 pmap_unmap_cpu_windows_copy(
8906 	unsigned int index)
8907 {
8908 #if XNU_MONITOR
8909 	return pmap_unmap_cpu_windows_copy_ppl(index);
8910 #else
8911 	return pmap_unmap_cpu_windows_copy_internal(index);
8912 #endif
8913 }
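
/*
 * Illustrative sketch (editorial addition, not compiled): the per-CPU copy
 * windows are used in a strict map/access/unmap sequence, assuming the caller
 * keeps preemption disabled so the CPU (and therefore the window) stays stable.
 * 'pn' and the cacheability value are hypothetical.
 */
#if 0
	ppnum_t      pn        = /* hypothetical page to access */ 0;
	unsigned int wimg_bits = VM_WIMG_DEFAULT;

	unsigned int index = pmap_map_cpu_windows_copy(pn, VM_PROT_READ, wimg_bits);
	vm_offset_t  va    = pmap_cpu_windows_copy_addr(cpu_number(), index);
	/* ... read the page contents through 'va' ... */
	pmap_unmap_cpu_windows_copy(index);
#endif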
8914 
8915 #if XNU_MONITOR
8916 
8917 MARK_AS_PMAP_TEXT void
8918 pmap_invoke_with_page(
8919 	ppnum_t page_number,
8920 	void *ctx,
8921 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8922 {
8923 	#pragma unused(page_number, ctx, callback)
8924 }
8925 
8926 /*
8927  * Loop over every pmap_io_range (I/O ranges marked as owned by
8928  * the PPL in the device tree) and conditionally call callback() on each range
8929  * that needs to be included in the hibernation image.
8930  *
8931  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8932  *                 context is needed in the callback.
8933  * @param callback Callback function invoked on each range whose PMAP_IO_RANGE_NEEDS_HIBERNATING flag is set.
8934  */
8935 MARK_AS_PMAP_TEXT void
8936 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8937 {
8938 	extern const pmap_io_range_t* io_attr_table;
8939 	extern const unsigned int num_io_rgns;
8940 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8941 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8942 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8943 		}
8944 	}
8945 }
8946 
8947 /**
8948  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8949  * PPL-owned page. Otherwise, do nothing.
8950  *
8951  * @param addr Physical address of the page to set the HASHED flag on.
8952  */
8953 MARK_AS_PMAP_TEXT void
8954 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8955 {
8956 	/* Ignore non-managed kernel memory. */
8957 	if (!pa_valid(addr)) {
8958 		return;
8959 	}
8960 
8961 	const unsigned int pai = pa_index(addr);
8962 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8963 		pv_entry_t **pv_h = pai_to_pvh(pai);
8964 
8965 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8966 		pvh_lock(pai);
8967 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8968 		pvh_unlock(pai);
8969 	}
8970 }
8971 
8972 /**
8973  * Loop through every physical page in the system and clear out the HASHED flag
8974  * on every PPL-owned page. That flag is used to keep track of which pages have
8975  * been hashed into the hibernation image during the hibernation entry process.
8976  *
8977  * The HASHED flag needs to be cleared out between hibernation cycles because the
8978  * pv_head_table and pp_attr_table might have been copied into the hibernation
8979  * image with the HASHED flag set on certain pages. It's important to clear the
8980  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8981  * into the hibernation image can't be compromised across hibernation cycles.
8982  */
8983 MARK_AS_PMAP_TEXT void
8984 pmap_clear_ppl_hashed_flag_all(void)
8985 {
8986 	const unsigned int last_index = pa_index(vm_last_phys);
8987 	pv_entry_t **pv_h = NULL;
8988 
8989 	for (int pai = 0; pai < last_index; ++pai) {
8990 		pv_h = pai_to_pvh(pai);
8991 
8992 		/* Test for a PPL-owned page that has the HASHED flag set in its pv_head_table entry. */
8993 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8994 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8995 			pvh_lock(pai);
8996 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8997 			pvh_unlock(pai);
8998 		}
8999 	}
9000 }
9001 
9002 /**
9003  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9004  * ppl_hib driver will call this after all wired pages have been copied into the
9005  * hibernation image.
9006  */
9007 MARK_AS_PMAP_TEXT void
9008 pmap_check_ppl_hashed_flag_all(void)
9009 {
9010 	const unsigned int last_index = pa_index(vm_last_phys);
9011 	pv_entry_t **pv_h = NULL;
9012 
9013 	for (int pai = 0; pai < last_index; ++pai) {
9014 		pv_h = pai_to_pvh(pai);
9015 
9016 		/**
9017 		 * The PMAP stacks are explicitly not saved into the image so skip checking
9018 		 * the pages that contain the PMAP stacks.
9019 		 */
9020 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9021 		    (pai < pa_index(pmap_stacks_end_pa));
9022 
9023 		if (!is_pmap_stack &&
9024 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9025 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9026 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9027 		}
9028 	}
9029 }
9030 
9031 #endif /* XNU_MONITOR */
9032 
9033 /*
9034  * Indicate that a pmap is intended to be used as a nested pmap
9035  * within one or more larger address spaces.  This must be set
9036  * before pmap_nest() is called with this pmap as the 'subordinate'.
9037  */
9038 MARK_AS_PMAP_TEXT void
9039 pmap_set_nested_internal(
9040 	pmap_t pmap)
9041 {
9042 	validate_pmap_mutable(pmap);
9043 	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
9044 		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
9045 		    __func__, pmap, pmap->type);
9046 	}
9047 	pmap->type = PMAP_TYPE_NESTED;
9048 	pmap_get_pt_ops(pmap)->free_id(pmap);
9049 }
9050 
9051 void
9052 pmap_set_nested(
9053 	pmap_t pmap)
9054 {
9055 #if XNU_MONITOR
9056 	pmap_set_nested_ppl(pmap);
9057 #else
9058 	pmap_set_nested_internal(pmap);
9059 #endif
9060 }
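
/*
 * Illustrative ordering sketch (editorial addition, not compiled): a shared
 * region pmap must be marked nested before its first pmap_nest() into a task
 * pmap.  'shared_region_pmap' and 'task_pmap' are hypothetical.
 */
#if 0
	pmap_set_nested(shared_region_pmap);   /* must precede the first pmap_nest() */
	kern_return_t kr = pmap_nest(task_pmap, shared_region_pmap,
	    SHARED_REGION_BASE_ARM64, SHARED_REGION_SIZE_ARM64);
#endif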
9061 
9062 /*
9063  * pmap_trim_range(pmap, start, end)
9064  *
9065  * pmap  = pmap to operate on
9066  * start = start of the range
9067  * end   = end of the range
9068  *
9069  * Attempts to deallocate TTEs for the given range within the nested region.
9070  */
9071 MARK_AS_PMAP_TEXT static void
9072 pmap_trim_range(
9073 	pmap_t pmap,
9074 	addr64_t start,
9075 	addr64_t end)
9076 {
9077 	addr64_t cur;
9078 	addr64_t nested_region_start;
9079 	addr64_t nested_region_end;
9080 	addr64_t adjusted_start;
9081 	addr64_t adjusted_end;
9082 	addr64_t adjust_offmask;
9083 	tt_entry_t * tte_p;
9084 	pt_entry_t * pte_p;
9085 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9086 
9087 	if (__improbable(end < start)) {
9088 		panic("%s: invalid address range, "
9089 		    "pmap=%p, start=%p, end=%p",
9090 		    __func__,
9091 		    pmap, (void*)start, (void*)end);
9092 	}
9093 
9094 	nested_region_start = pmap->nested_region_addr;
9095 	nested_region_end = nested_region_start + pmap->nested_region_size;
9096 
9097 	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9098 		panic("%s: range outside nested region %p-%p, "
9099 		    "pmap=%p, start=%p, end=%p",
9100 		    __func__, (void *)nested_region_start, (void *)nested_region_end,
9101 		    pmap, (void*)start, (void*)end);
9102 	}
9103 
9104 	/* Contract the range to TT page boundaries. */
9105 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9106 	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9107 	adjusted_end = end & ~adjust_offmask;
9108 
9109 	/* Iterate over the range, trying to remove TTEs. */
9110 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
9111 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9112 
9113 		tte_p = pmap_tte(pmap, cur);
9114 
9115 		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
9116 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
9117 
9118 			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
9119 			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
9120 				/* Deallocate for the nested map. */
9121 				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9122 			} else if (pmap->type == PMAP_TYPE_USER) {
9123 				/**
9124 				 * Just remove for the parent map. If the leaf table pointed
9125 				 * to by the TTE being removed (owned by the nested pmap)
9126 				 * has any mappings, then this call will panic. This
9127 				 * enforces the policy that tables being trimmed must be
9128 				 * empty to prevent possible use-after-free attacks.
9129 				 */
9130 				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9131 			} else {
9132 				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9133 			}
9134 		} else {
9135 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9136 		}
9137 	}
9138 
9139 #if (__ARM_VMSA__ > 7)
9140 	/* Remove empty L2 TTs. */
9141 	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9142 	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
9143 
9144 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
9145 		/* For each L1 entry in our range... */
9146 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9147 
9148 		bool remove_tt1e = true;
9149 		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9150 		tt_entry_t * tt2e_start;
9151 		tt_entry_t * tt2e_end;
9152 		tt_entry_t * tt2e_p;
9153 		tt_entry_t tt1e;
9154 
9155 		if (tt1e_p == NULL) {
9156 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9157 			continue;
9158 		}
9159 
9160 		tt1e = *tt1e_p;
9161 
9162 		if (tt1e == ARM_TTE_TYPE_FAULT) {
9163 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9164 			continue;
9165 		}
9166 
9167 		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9168 		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9169 
9170 		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9171 			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9172 				/*
9173 				 * If any TTEs are populated, don't remove the
9174 				 * L1 TT.
9175 				 */
9176 				remove_tt1e = false;
9177 			}
9178 		}
9179 
9180 		if (remove_tt1e) {
9181 			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9182 		} else {
9183 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9184 		}
9185 	}
9186 #endif /* (__ARM_VMSA__ > 7) */
9187 }
9188 
9189 /*
9190  * pmap_trim_internal(grand, subord, vstart, size)
9191  *
9192  * grand  = pmap subord is nested in
9193  * subord = nested pmap
9194  * vstart = start of the used range in grand
9195  * size   = size of the used range
9196  *
9197  * Attempts to trim the shared region page tables down to only cover the given
9198  * range in subord and grand.
9199  */
9200 MARK_AS_PMAP_TEXT void
9201 pmap_trim_internal(
9202 	pmap_t grand,
9203 	pmap_t subord,
9204 	addr64_t vstart,
9205 	uint64_t size)
9206 {
9207 	addr64_t vend;
9208 	addr64_t adjust_offmask;
9209 
9210 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9211 		panic("%s: grand addr wraps around, "
9212 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9213 		    __func__, grand, subord, (void*)vstart, size);
9214 	}
9215 
9216 	validate_pmap_mutable(grand);
9217 	validate_pmap(subord);
9218 
9219 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9220 
9221 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9222 
9223 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9224 		panic("%s: subord is of non-nestable type 0x%hhx, "
9225 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9226 		    __func__, subord->type, grand, subord, (void*)vstart, size);
9227 	}
9228 
9229 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9230 		panic("%s: grand is of unsupported type 0x%hhx for nesting, "
9231 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9232 		    __func__, grand->type, grand, subord, (void*)vstart, size);
9233 	}
9234 
9235 	if (__improbable(grand->nested_pmap != subord)) {
9236 		panic("%s: grand->nested != subord, "
9237 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9238 		    __func__, grand, subord, (void*)vstart, size);
9239 	}
9240 
9241 	if (__improbable((size != 0) &&
9242 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9243 		panic("%s: grand range not in nested region, "
9244 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9245 		    __func__, grand, subord, (void*)vstart, size);
9246 	}
9247 
9248 
9249 	if (!grand->nested_has_no_bounds_ref) {
9250 		assert(subord->nested_bounds_set);
9251 
9252 		if (!grand->nested_bounds_set) {
9253 			/* Inherit the bounds from subord. */
9254 			grand->nested_region_true_start = subord->nested_region_true_start;
9255 			grand->nested_region_true_end = subord->nested_region_true_end;
9256 			grand->nested_bounds_set = true;
9257 		}
9258 
9259 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9260 		return;
9261 	}
9262 
9263 	if ((!subord->nested_bounds_set) && size) {
9264 		adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9265 
9266 		subord->nested_region_true_start = vstart;
9267 		subord->nested_region_true_end = vend;
9268 		subord->nested_region_true_start &= ~adjust_offmask;
9269 
9270 		if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9271 			panic("%s: padded true end wraps around, "
9272 			    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9273 			    __func__, grand, subord, (void*)vstart, size);
9274 		}
9275 
9276 		subord->nested_region_true_end &= ~adjust_offmask;
9277 		subord->nested_bounds_set = true;
9278 	}
9279 
9280 	if (subord->nested_bounds_set) {
9281 		/* Inherit the bounds from subord. */
9282 		grand->nested_region_true_start = subord->nested_region_true_start;
9283 		grand->nested_region_true_end = subord->nested_region_true_end;
9284 		grand->nested_bounds_set = true;
9285 
9286 		/* If we know the bounds, we can trim the pmap. */
9287 		grand->nested_has_no_bounds_ref = false;
9288 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9289 	} else {
9290 		/* Don't trim if we don't know the bounds. */
9291 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9292 		return;
9293 	}
9294 
9295 	/* Trim grand to only cover the given range. */
9296 	pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9297 	pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9298 
9299 	/* Try to trim subord. */
9300 	pmap_trim_subord(subord);
9301 }
9302 
9303 MARK_AS_PMAP_TEXT static void
9304 pmap_trim_self(pmap_t pmap)
9305 {
9306 	if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
9307 		/* If we have a no bounds ref, we need to drop it. */
9308 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9309 		pmap->nested_has_no_bounds_ref = false;
9310 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9311 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9312 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9313 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9314 
9315 		if (nested_bounds_set) {
9316 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9317 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9318 		}
9319 		/*
9320 		 * Try trimming the nested pmap, in case we had the
9321 		 * last reference.
9322 		 */
9323 		pmap_trim_subord(pmap->nested_pmap);
9324 	}
9325 }
9326 
9327 /*
9328  * pmap_trim_subord(subord)
9329  *
9330  * subord = nested pmap we are attempting to trim
9331  *
9332  * Drops a no-bounds reference on subord, and trims subord if that was the
9333  * last such reference and its true bounds are known.
9334  */
9335 MARK_AS_PMAP_TEXT static void
9336 pmap_trim_subord(pmap_t subord)
9337 {
9338 	bool contract_subord = false;
9339 
9340 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9341 
9342 	subord->nested_no_bounds_refcnt--;
9343 
9344 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9345 		/* If this was the last no bounds reference, trim subord. */
9346 		contract_subord = true;
9347 	}
9348 
9349 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9350 
9351 	if (contract_subord) {
9352 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9353 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9354 	}
9355 }
9356 
9357 void
9358 pmap_trim(
9359 	pmap_t grand,
9360 	pmap_t subord,
9361 	addr64_t vstart,
9362 	uint64_t size)
9363 {
9364 #if XNU_MONITOR
9365 	pmap_trim_ppl(grand, subord, vstart, size);
9366 
9367 	pmap_ledger_check_balance(grand);
9368 	pmap_ledger_check_balance(subord);
9369 #else
9370 	pmap_trim_internal(grand, subord, vstart, size);
9371 #endif
9372 }
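
/*
 * Illustrative sketch (editorial addition, not compiled): once the portion of
 * the nested (shared) region actually in use is known, the owner of the grand
 * pmap can release the page tables outside that portion on both pmaps.
 * 'task_pmap', 'shared_region_pmap', 'used_start', and 'used_size' are hypothetical.
 */
#if 0
	pmap_trim(task_pmap, shared_region_pmap, used_start, used_size);
#endif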
9373 
9374 #if HAS_APPLE_PAC
9375 void *
9376 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9377 {
9378 	void *res = NULL;
9379 	uint64_t current_intr_state = pmap_interrupts_disable();
9380 
9381 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9382 	switch (key) {
9383 	case ptrauth_key_asia:
9384 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9385 		break;
9386 	case ptrauth_key_asda:
9387 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9388 		break;
9389 	default:
9390 		panic("attempt to sign user pointer without process independent key");
9391 	}
9392 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9393 
9394 	pmap_interrupts_restore(current_intr_state);
9395 
9396 	return res;
9397 }
9398 
9399 void *
9400 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9401 {
9402 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9403 }
9404 
9405 void *
9406 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9407 {
9408 	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9409 		panic("attempt to auth user pointer without process independent key");
9410 	}
9411 
9412 	void *res = NULL;
9413 	uint64_t current_intr_state = pmap_interrupts_disable();
9414 
9415 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9416 	res = ml_auth_ptr_unchecked(value, key, discriminator);
9417 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9418 
9419 	pmap_interrupts_restore(current_intr_state);
9420 
9421 	return res;
9422 }
9423 
9424 void *
9425 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9426 {
9427 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9428 }
9429 #endif /* HAS_APPLE_PAC */
9430 
9431 /*
9432  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9433  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9434  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9435  * return value, to indicate where a preempted [un]nest operation should resume.
9436  * When the return value contains the ending address of the nested region with
9437  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9438  */
9439 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9440 
9441 /*
9442  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9443  *
9444  *	grand  = the pmap that we will nest subord into
9445  *	subord = the pmap that goes into grand
9446  *	vstart = start of the range in grand to be nested
9447  *	size   = size of the nested area (up to 16TB)
9448  *
9449  *	Inserts a pmap into another.  This is used to implement shared segments.
9450  *
9451  */
9452 
9453 /**
9454  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9455  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9456  * This function operates in 3 main phases:
9457  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9458  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9459  *    the mapping range are present in subord.
9460  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9461  *    contains pointers to subord's leaf-level pagetable pages for the specified
9462  *    VA range.
9463  *
9464  * This function may return early due to pending AST_URGENT preemption; if so
9465  * it will indicate the need to be re-entered.
9466  *
9467  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9468  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9469  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9470  * @param size twig-aligned size of the nesting range
9471  * @param vrestart the twig-aligned starting address of the current call.  May contain
9472  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9473  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9474  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9475  *
9476  * @return the virtual address at which to restart the operation, possibly including
9477  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9478  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9479  */
9480 MARK_AS_PMAP_TEXT vm_map_offset_t
9481 pmap_nest_internal(
9482 	pmap_t grand,
9483 	pmap_t subord,
9484 	addr64_t vstart,
9485 	uint64_t size,
9486 	vm_map_offset_t vrestart,
9487 	kern_return_t *krp)
9488 {
9489 	kern_return_t kr = KERN_FAILURE;
9490 	vm_map_offset_t vaddr;
9491 	tt_entry_t     *stte_p;
9492 	tt_entry_t     *gtte_p;
9493 	unsigned int    nested_region_asid_bitmap_size;
9494 	unsigned int*   nested_region_asid_bitmap;
9495 	int             expand_options = 0;
9496 	bool            deref_subord = true;
9497 
9498 	addr64_t vend;
9499 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9500 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9501 	}
9502 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9503 	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9504 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9505 		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9506 	}
9507 
9508 	assert(krp != NULL);
9509 	validate_pmap_mutable(grand);
9510 	validate_pmap(subord);
9511 #if XNU_MONITOR
9512 	/*
9513 	 * Ordering is important here.  validate_pmap() has already ensured subord is a
9514 	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9515 	 * be in the process of being destroyed.  If destruction is already committed,
9516 	 * then the check of ref_count below will cover us.  If destruction is initiated
9517 	 * during or after this call, then pmap_destroy() will catch the non-zero
9518 	 * nested_count.
9519 	 */
9520 	os_atomic_inc(&subord->nested_count, relaxed);
9521 	os_atomic_thread_fence(seq_cst);
9522 #endif
9523 	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9524 		panic("%s: invalid subordinate pmap %p", __func__, subord);
9525 	}
9526 
9527 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9528 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9529 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9530 	}
9531 
9532 #if XNU_MONITOR
9533 	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9534 #endif
9535 
9536 	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9537 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9538 		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9539 		    grand, vstart, size, (unsigned long long)vrestart);
9540 	}
9541 
9542 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9543 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9544 	}
9545 
9546 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9547 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9548 	}
9549 
9550 	if (subord->nested_region_asid_bitmap == NULL) {
9551 		nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
9552 
9553 #if XNU_MONITOR
9554 		pmap_paddr_t pa = 0;
9555 
9556 		if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9557 			panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
9558 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9559 			    __FUNCTION__, nested_region_asid_bitmap_size,
9560 			    grand, subord, vstart, size);
9561 		}
9562 
9563 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9564 
9565 		if (kr != KERN_SUCCESS) {
9566 			goto nest_cleanup;
9567 		}
9568 
9569 		assert(pa);
9570 
9571 		nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9572 #else
9573 		nested_region_asid_bitmap = kalloc_data(
9574 			nested_region_asid_bitmap_size * sizeof(unsigned int),
9575 			Z_WAITOK | Z_ZERO);
9576 #endif
9577 
9578 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9579 		if (subord->nested_region_asid_bitmap == NULL) {
9580 			subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
9581 			subord->nested_region_addr = vstart;
9582 			subord->nested_region_size = (mach_vm_offset_t) size;
9583 
9584 			/**
9585 			 * Ensure that the rest of the subord->nested_region_* fields are
9586 			 * initialized and visible before setting the nested_region_asid_bitmap
9587 			 * field (which is used as the flag to say that the rest are initialized).
9588 			 */
9589 			__builtin_arm_dmb(DMB_ISHST);
9590 			subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
9591 			nested_region_asid_bitmap = NULL;
9592 		}
9593 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9594 		if (nested_region_asid_bitmap != NULL) {
9595 #if XNU_MONITOR
9596 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9597 #else
9598 			kfree_data(nested_region_asid_bitmap,
9599 			    nested_region_asid_bitmap_size * sizeof(unsigned int));
9600 #endif
9601 		}
9602 	}
9603 
9604 	/**
9605 	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9606 	 * speculated before their initialization.
9607 	 */
9608 	__builtin_arm_dmb(DMB_ISHLD);
9609 
9610 	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9611 		uint64_t        new_size;
9612 		unsigned int    new_nested_region_asid_bitmap_size;
9613 		unsigned int*   new_nested_region_asid_bitmap;
9614 
9615 		nested_region_asid_bitmap = NULL;
9616 		nested_region_asid_bitmap_size = 0;
9617 		new_size =  vend - subord->nested_region_addr;
9618 
9619 		/* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
9620 		new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
9621 
9622 #if XNU_MONITOR
9623 		pmap_paddr_t pa = 0;
9624 
9625 		if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9626 			panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
9627 			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9628 			    __FUNCTION__, new_nested_region_asid_bitmap_size,
9629 			    grand, subord, vstart, new_size);
9630 		}
9631 
9632 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9633 
9634 		if (kr != KERN_SUCCESS) {
9635 			goto nest_cleanup;
9636 		}
9637 
9638 		assert(pa);
9639 
9640 		new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9641 #else
9642 		new_nested_region_asid_bitmap = kalloc_data(
9643 			new_nested_region_asid_bitmap_size * sizeof(unsigned int),
9644 			Z_WAITOK | Z_ZERO);
9645 #endif
9646 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9647 		if (subord->nested_region_size < new_size) {
9648 			bcopy(subord->nested_region_asid_bitmap,
9649 			    new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
9650 			nested_region_asid_bitmap_size  = subord->nested_region_asid_bitmap_size;
9651 			nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
9652 			subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
9653 			subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
9654 			subord->nested_region_size = new_size;
9655 			new_nested_region_asid_bitmap = NULL;
9656 		}
9657 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9658 		if (nested_region_asid_bitmap != NULL) {
9659 #if XNU_MONITOR
9660 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9661 #else
9662 			kfree_data(nested_region_asid_bitmap,
9663 			    nested_region_asid_bitmap_size * sizeof(unsigned int));
9664 #endif
9665 		}
9666 		if (new_nested_region_asid_bitmap != NULL) {
9667 #if XNU_MONITOR
9668 			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9669 #else
9670 			kfree_data(new_nested_region_asid_bitmap,
9671 			    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9672 #endif
9673 		}
9674 	}
9675 
9676 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9677 
9678 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9679 		/*
9680 		 * If this is grand's first nesting operation, keep the reference on subord.
9681 		 * It will be released by pmap_destroy_internal() when grand is destroyed.
9682 		 */
9683 		deref_subord = false;
9684 
9685 		if (!subord->nested_bounds_set) {
9686 			/*
9687 			 * We are nesting without the shared regions bounds
9688 			 * being known.  We'll have to trim the pmap later.
9689 			 */
9690 			grand->nested_has_no_bounds_ref = true;
9691 			subord->nested_no_bounds_refcnt++;
9692 		}
9693 
9694 		grand->nested_region_addr = vstart;
9695 		grand->nested_region_size = (mach_vm_offset_t) size;
9696 	} else {
9697 		if (__improbable(grand->nested_pmap != subord)) {
9698 			panic("pmap_nest() pmap %p already has a different nested pmap", grand);
9699 		} else if (__improbable(grand->nested_region_addr > vstart)) {
9700 			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9701 		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9702 			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9703 		}
9704 	}
9705 
9706 	vaddr = vrestart & ~PMAP_NEST_GRAND;
9707 	if (vaddr < subord->nested_region_true_start) {
9708 		vaddr = subord->nested_region_true_start;
9709 	}
9710 
9711 	addr64_t true_end = vend;
9712 	if (true_end > subord->nested_region_true_end) {
9713 		true_end = subord->nested_region_true_end;
9714 	}
9715 	__unused unsigned int ttecount = 0;
9716 
9717 	if (vrestart & PMAP_NEST_GRAND) {
9718 		goto nest_grand;
9719 	}
9720 #if     (__ARM_VMSA__ == 7)
9721 
9722 	while (vaddr < true_end) {
9723 		stte_p = pmap_tte(subord, vaddr);
9724 		if ((stte_p == (tt_entry_t *)NULL) || (((*stte_p) & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE)) {
9725 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9726 			kr = pmap_expand(subord, vaddr, expand_options, PMAP_TT_L2_LEVEL);
9727 
9728 			if (kr != KERN_SUCCESS) {
9729 				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9730 				goto done;
9731 			}
9732 
9733 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9734 		}
9735 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9736 		pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9737 		stte_p = pmap_tte(grand, vaddr);
9738 		if (stte_p == (tt_entry_t *)NULL) {
9739 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9740 			kr = pmap_expand(grand, vaddr, expand_options, PMAP_TT_L1_LEVEL);
9741 
9742 			if (kr != KERN_SUCCESS) {
9743 				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9744 				goto done;
9745 			}
9746 		} else {
9747 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9748 			kr = KERN_SUCCESS;
9749 		}
9750 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9751 		vaddr += ARM_TT_L1_SIZE;
9752 		vrestart = vaddr;
9753 	}
9754 
9755 #else
9756 	while (vaddr < true_end) {
9757 		stte_p = pmap_tte(subord, vaddr);
9758 		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9759 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9760 			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9761 
9762 			if (kr != KERN_SUCCESS) {
9763 				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9764 				goto done;
9765 			}
9766 
9767 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9768 		}
9769 		vaddr += pt_attr_twig_size(pt_attr);
9770 		vrestart = vaddr;
9771 		++ttecount;
9772 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9773 		    pmap_pending_preemption())) {
9774 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9775 			kr = KERN_SUCCESS;
9776 			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9777 			goto done;
9778 		}
9779 	}
9780 #endif
9781 	/*
9782 	 * copy TTEs from subord pmap into grand pmap
9783 	 */
9784 
9785 	vaddr = (vm_map_offset_t) vstart;
9786 	if (vaddr < subord->nested_region_true_start) {
9787 		vaddr = subord->nested_region_true_start;
9788 	}
9789 	vrestart = vaddr | PMAP_NEST_GRAND;
9790 
9791 nest_grand:
9792 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9793 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9794 #if     (__ARM_VMSA__ == 7)
9795 	while (vaddr < true_end) {
9796 		stte_p = pmap_tte(subord, vaddr);
9797 		gtte_p = pmap_tte(grand, vaddr);
9798 		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9799 			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9800 			    __func__, gtte_p, grand);
9801 		}
9802 		*gtte_p = *stte_p;
9803 		vaddr += ARM_TT_L1_SIZE;
9804 	}
9805 	vrestart = vaddr | PMAP_NEST_GRAND;
9806 #else
9807 	while (vaddr < true_end) {
9808 		stte_p = pmap_tte(subord, vaddr);
9809 		gtte_p = pmap_tte(grand, vaddr);
9810 		if (gtte_p == PT_ENTRY_NULL) {
9811 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9812 			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9813 			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9814 
9815 			if (kr != KERN_SUCCESS) {
9816 				goto done;
9817 			}
9818 
9819 			gtte_p = pmap_tt2e(grand, vaddr);
9820 		}
9821 		/* Don't leak a page table page.  Don't violate break-before-make. */
9822 		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9823 			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9824 			    __func__, gtte_p, grand);
9825 		}
9826 		*gtte_p = *stte_p;
9827 
9828 		vaddr += pt_attr_twig_size(pt_attr);
9829 		vrestart = vaddr | PMAP_NEST_GRAND;
9830 		++ttecount;
9831 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9832 		    pmap_pending_preemption())) {
9833 			break;
9834 		}
9835 	}
9836 #endif
9837 	if (vaddr >= true_end) {
9838 		vrestart = vend | PMAP_NEST_GRAND;
9839 	}
9840 
9841 	kr = KERN_SUCCESS;
9842 done:
9843 
9844 	FLUSH_PTE();
9845 	__builtin_arm_isb(ISB_SY);
9846 
9847 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9848 #if XNU_MONITOR
9849 nest_cleanup:
9850 	if (kr != KERN_SUCCESS) {
9851 		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9852 		*krp = kr;
9853 		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9854 	}
9855 #else
9856 	if (kr != KERN_SUCCESS) {
9857 		*krp = kr;
9858 	}
9859 #endif
9860 	if (deref_subord) {
9861 #if XNU_MONITOR
9862 		os_atomic_dec(&subord->nested_count, relaxed);
9863 #endif
9864 		pmap_destroy_internal(subord);
9865 	}
9866 	return vrestart;
9867 }
9868 
9869 kern_return_t
9870 pmap_nest(
9871 	pmap_t grand,
9872 	pmap_t subord,
9873 	addr64_t vstart,
9874 	uint64_t size)
9875 {
9876 	kern_return_t kr = KERN_SUCCESS;
9877 	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
9878 	vm_map_offset_t vend = vaddr + size;
9879 	__unused vm_map_offset_t vlast = vaddr;
9880 
9881 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9882 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9883 	    VM_KERNEL_ADDRHIDE(vstart));
9884 
9885 	pmap_verify_preemptible();
9886 #if XNU_MONITOR
9887 	while (vaddr != (vend | PMAP_NEST_GRAND)) {
9888 		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
9889 		if (kr == KERN_RESOURCE_SHORTAGE) {
9890 			pmap_alloc_page_for_ppl(0);
9891 			kr = KERN_SUCCESS;
9892 		} else if (kr != KERN_SUCCESS) {
9893 			break;
9894 		} else if (vaddr == vlast) {
9895 			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9896 			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
9897 		}
9898 		vlast = vaddr;
9899 	}
9900 
9901 	pmap_ledger_check_balance(grand);
9902 	pmap_ledger_check_balance(subord);
9903 #else
9904 	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
9905 		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
9906 	}
9907 #endif
9908 
9909 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9910 
9911 	return kr;
9912 }
9913 
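/*
 * Illustrative use of pmap_nest() (a sketch only; the calling code and variable
 * names below are assumptions, not taken from this file).  The VM layer nests a
 * subordinate pmap, e.g. a shared-region pmap, into a task pmap over a
 * twig-aligned range:
 *
 *	kern_return_t kr;
 *	kr = pmap_nest(task_pmap,            // 'grand': the task's top-level pmap
 *	               shared_region_pmap,   // 'subord': pmap whose page tables are shared
 *	               nest_base,            // twig-aligned start of the nested range
 *	               nest_size);           // twig-aligned size of the nested range
 *
 * pmap_nest() above simply re-drives pmap_nest_internal()/pmap_nest_ppl() until the
 * returned cursor equals (vstart + size) | PMAP_NEST_GRAND, so the nesting operation
 * is restartable across preemption checks and PPL page shortages.
 */
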
9914 /*
9915  *	kern_return_t pmap_unnest(grand, vaddr, size)
9916  *
9917  *	grand  = the pmap that will have the virtual range unnested
9918  *	vaddr  = start of range in pmap to be unnested
9919  *	size   = size of range in pmap to be unnested
9920  *
9921  */
9922 
9923 kern_return_t
9924 pmap_unnest(
9925 	pmap_t grand,
9926 	addr64_t vaddr,
9927 	uint64_t size)
9928 {
9929 	return pmap_unnest_options(grand, vaddr, size, 0);
9930 }
9931 
9932 /**
9933  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9934  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9935  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9936  * still have the region nested.  The mappings in 'grand' will be left empty
9937  * with the assumption that they will be demand-filled by subsequent access faults.
9938  *
9939  * This function operates in 2 main phases:
9940  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9941  *    them non-global.
9942  * 2. Clearing of the twig-level TTEs for the address range in grand.
9943  *
9944  * This function may return early due to pending AST_URGENT preemption; if so,
9945  * the return value will indicate that the operation needs to be re-entered.
9946  *
9947  * @param grand pmap from which to unnest mappings
9948  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9949  * @param size twig-aligned size of the nested range
9950  * @param vrestart the page-aligned address at which the current call should resume.  May contain
9951  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9952  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9953  *        grand is being torn down and step 1) above is not needed.
9954  *
9955  * @return the virtual address at which to restart the operation, possibly including
9956  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9957  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9958  */
9959 MARK_AS_PMAP_TEXT vm_map_offset_t
9960 pmap_unnest_options_internal(
9961 	pmap_t grand,
9962 	addr64_t vaddr,
9963 	uint64_t size,
9964 	vm_map_offset_t vrestart,
9965 	unsigned int option)
9966 {
9967 	vm_map_offset_t start;
9968 	vm_map_offset_t addr;
9969 	tt_entry_t     *tte_p;
9970 	unsigned int    current_index;
9971 	unsigned int    start_index;
9972 	unsigned int    max_index;
9973 	unsigned int    entry_count = 0;
9974 
9975 	addr64_t vend;
9976 	addr64_t true_end;
9977 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9978 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9979 	}
9980 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9981 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9982 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9983 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9984 	}
9985 
9986 	validate_pmap_mutable(grand);
9987 
9988 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9989 
9990 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9991 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9992 		    (unsigned long long)vaddr, (unsigned long long)size);
9993 	}
9994 
9995 	if (__improbable(grand->nested_pmap == NULL)) {
9996 		panic("%s: %p has no nested pmap", __func__, grand);
9997 	}
9998 
9999 	true_end = vend;
10000 	if (true_end > grand->nested_pmap->nested_region_true_end) {
10001 		true_end = grand->nested_pmap->nested_region_true_end;
10002 	}
10003 
10004 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10005 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10006 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10007 		}
10008 
10009 		pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10010 
10011 		start = vrestart;
10012 		if (start < grand->nested_pmap->nested_region_true_start) {
10013 			start = grand->nested_pmap->nested_region_true_start;
10014 		}
10015 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10016 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10017 		bool flush_tlb = false;
10018 
10019 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10020 			pt_entry_t  *bpte, *cpte;
10021 
10022 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10023 
10024 			bpte = pmap_pte(grand->nested_pmap, addr);
10025 
10026 			/*
10027 			 * If we've re-entered this function partway through unnesting a leaf region, the
10028 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10029 			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
10030 			 * address.
10031 			 */
10032 			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
10033 			    (addr & pt_attr_twig_offmask(pt_attr))) {
10034 				/*
10035 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
10036 				 * the nested pmap in this region will now be marked non-global.  Do this
10037 				 * before marking any of the PTEs within the region as non-global to avoid
10038 				 * the possibility of pmap_enter() subsequently inserting a global mapping
10039 				 * in the region, which could lead to a TLB conflict if a non-global entry
10040 				 * is later inserted for the same VA in a pmap which has fully unnested this
10041 				 * region.
10042 				 */
10043 				setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
10044 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10045 					pmap_paddr_t    pa;
10046 					unsigned int    pai = 0;
10047 					boolean_t               managed = FALSE;
10048 					pt_entry_t  spte;
10049 
10050 					if ((*cpte != ARM_PTE_TYPE_FAULT)
10051 					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10052 						spte = *((volatile pt_entry_t*)cpte);
10053 						while (!managed) {
10054 							pa = pte_to_pa(spte);
10055 							if (!pa_valid(pa)) {
10056 								break;
10057 							}
10058 							pai = pa_index(pa);
10059 							pvh_lock(pai);
10060 							spte = *((volatile pt_entry_t*)cpte);
10061 							pa = pte_to_pa(spte);
10062 							if (pai == pa_index(pa)) {
10063 								managed = TRUE;
10064 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
10065 							}
10066 							pvh_unlock(pai);
10067 						}
10068 
10069 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10070 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
10071 							flush_tlb = true;
10072 						}
10073 
10074 						if (managed) {
10075 							pvh_assert_locked(pai);
10076 							pvh_unlock(pai);
10077 						}
10078 					}
10079 
10080 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10081 					vrestart = addr;
10082 					++entry_count;
10083 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10084 					    pmap_pending_preemption())) {
10085 						goto unnest_subord_done;
10086 					}
10087 				}
10088 			}
10089 			addr = vlim;
10090 			vrestart = addr;
10091 			++entry_count;
10092 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10093 			    pmap_pending_preemption())) {
10094 				break;
10095 			}
10096 		}
10097 
10098 unnest_subord_done:
10099 		if (flush_tlb) {
10100 			FLUSH_PTE_STRONG();
10101 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10102 		}
10103 
10104 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10105 		if (current_index < max_index) {
10106 			return vrestart;
10107 		}
10108 	}
10109 
10110 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
10111 
10112 	/*
10113 	 * Invalidate all twig-level TTEs for the nested range at vaddr in pmap 'grand'.
10114 	 */
10115 	if (vrestart & PMAP_NEST_GRAND) {
10116 		addr = vrestart & ~PMAP_NEST_GRAND;
10117 		if (__improbable((addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10118 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10119 		}
10120 	} else {
10121 		addr = vaddr;
10122 		vrestart = vaddr | PMAP_NEST_GRAND;
10123 	}
10124 
10125 	if (addr < grand->nested_pmap->nested_region_true_start) {
10126 		addr = grand->nested_pmap->nested_region_true_start;
10127 	}
10128 
10129 	while (addr < true_end) {
10130 		tte_p = pmap_tte(grand, addr);
10131 		/*
10132 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10133 		 * so it's possible that a region we're trying to unnest may not have been
10134 		 * nested in the first place.
10135 		 */
10136 		if (tte_p != NULL) {
10137 			*tte_p = ARM_TTE_TYPE_FAULT;
10138 		}
10139 		addr += pt_attr_twig_size(pt_attr);
10140 		vrestart = addr | PMAP_NEST_GRAND;
10141 		++entry_count;
10142 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10143 		    pmap_pending_preemption())) {
10144 			break;
10145 		}
10146 	}
10147 	if (addr >= true_end) {
10148 		vrestart = vend | PMAP_NEST_GRAND;
10149 	}
10150 
10151 	FLUSH_PTE_STRONG();
	/*
	 * Note: 'start' is only assigned when the first phase runs above, so flush from the
	 * caller-supplied base address; it is always a lower bound on the TTEs cleared in this pass.
	 */
10152 	PMAP_UPDATE_TLBS(grand, vaddr, addr, false, false);
10153 
10154 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10155 
10156 	return vrestart;
10157 }
10158 
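/*
 * Minimal sketch of the resume protocol implemented by pmap_unnest_options_internal()
 * (pmap_unnest_options() below is the real driver; this just restates its loop):
 *
 *	vm_map_offset_t cursor = vaddr;              // phase 1 begins at the range base
 *	while (cursor != ((vaddr + size) | PMAP_NEST_GRAND)) {
 *		// Bit 0 of the cursor selects the phase: clear => still marking the nested
 *		// pmap's PTEs non-global; set => clearing grand's twig-level TTEs.
 *		cursor = pmap_unnest_options_internal(grand, vaddr, size, cursor, 0);
 *	}
 */
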
10159 kern_return_t
10160 pmap_unnest_options(
10161 	pmap_t grand,
10162 	addr64_t vaddr,
10163 	uint64_t size,
10164 	unsigned int option)
10165 {
10166 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10167 	vm_map_offset_t vend = vaddr + size;
10168 	__unused vm_map_offset_t vlast = vrestart;
10169 
10170 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10171 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10172 
10173 	pmap_verify_preemptible();
10174 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10175 #if XNU_MONITOR
10176 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10177 		if (vrestart == vlast) {
10178 			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
10179 			    __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
10180 		}
10181 		vlast = vrestart;
10182 #else
10183 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10184 #endif
10185 	}
10186 
10187 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10188 
10189 	return KERN_SUCCESS;
10190 }
10191 
10192 boolean_t
10193 pmap_adjust_unnest_parameters(
10194 	__unused pmap_t p,
10195 	__unused vm_map_offset_t *s,
10196 	__unused vm_map_offset_t *e)
10197 {
10198 	return TRUE; /* to get to log_unnest_badness()... */
10199 }
10200 
10201 /*
10202  * disable no-execute capability on
10203  * the specified pmap
10204  */
10205 #if DEVELOPMENT || DEBUG
10206 void
10207 pmap_disable_NX(
10208 	pmap_t pmap)
10209 {
10210 	pmap->nx_enabled = FALSE;
10211 }
10212 #else
10213 void
10214 pmap_disable_NX(
10215 	__unused pmap_t pmap)
10216 {
10217 }
10218 #endif
10219 
10220 /*
10221  * flush a range of hardware TLB entries.
10222  * NOTE: assumes the smallest TLB entry in use will be for
10223  * an ARM small page (4K).
10224  */
10225 
10226 #define ARM_FULL_TLB_FLUSH_THRESHOLD 64
10227 
10228 #if __ARM_RANGE_TLBI__
10229 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
10230 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_PAGES
10231 #else
10232 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10233 #endif // __ARM_RANGE_TLBI__
10234 
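/*
 * Summary of how flush_mmu_tlb_region_asid_async() below uses these thresholds on
 * arm64 (a restatement of the code, not additional behavior):
 *
 *	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD)         // large range: flush the whole ASID
 *		flush_mmu_tlb_asid_async(...);               //   (or all TLBs for nested/ASID-0 pmaps)
 *	else if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD)   // medium range, __ARM_RANGE_TLBI__ only:
 *		flush_mmu_tlb_range_async(...);              //   a single range-based TLBI
 *	else                                                 // small range:
 *		flush_mmu_tlb_entries_async(...);            //   per-page invalidates
 */
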
10235 static void
10236 flush_mmu_tlb_region_asid_async(
10237 	vm_offset_t va,
10238 	size_t length,
10239 	pmap_t pmap,
10240 	bool last_level_only __unused)
10241 {
10242 #if     (__ARM_VMSA__ == 7)
10243 	vm_offset_t     end = va + length;
10244 	uint32_t        asid;
10245 
10246 	asid = pmap->hw_asid;
10247 
10248 	if (length / ARM_SMALL_PAGE_SIZE > ARM_FULL_TLB_FLUSH_THRESHOLD) {
10249 		boolean_t       flush_all = FALSE;
10250 
10251 		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10252 			flush_all = TRUE;
10253 		}
10254 		if (flush_all) {
10255 			flush_mmu_tlb_async();
10256 		} else {
10257 			flush_mmu_tlb_asid_async(asid);
10258 		}
10259 
10260 		return;
10261 	}
10262 	if (pmap->type == PMAP_TYPE_NESTED) {
10263 #if     !__ARM_MP_EXT__
10264 		flush_mmu_tlb();
10265 #else
10266 		va = arm_trunc_page(va);
10267 		while (va < end) {
10268 			flush_mmu_tlb_mva_entries_async(va);
10269 			va += ARM_SMALL_PAGE_SIZE;
10270 		}
10271 #endif
10272 		return;
10273 	}
10274 	va = arm_trunc_page(va) | (asid & 0xff);
10275 	flush_mmu_tlb_entries_async(va, end);
10276 
10277 #else
10278 	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
10279 	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
10280 	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
10281 	uint32_t    asid;
10282 
10283 	asid = pmap->hw_asid;
10284 
10285 	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
10286 		boolean_t       flush_all = FALSE;
10287 
10288 		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10289 			flush_all = TRUE;
10290 		}
10291 		if (flush_all) {
10292 			flush_mmu_tlb_async();
10293 		} else {
10294 			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
10295 		}
10296 		return;
10297 	}
10298 #if __ARM_RANGE_TLBI__
10299 	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
10300 		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
10301 		if (pmap->type == PMAP_TYPE_NESTED) {
10302 			flush_mmu_tlb_allrange_async(va, last_level_only);
10303 		} else {
10304 			flush_mmu_tlb_range_async(va, last_level_only);
10305 		}
10306 		return;
10307 	}
10308 #endif
10309 	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
10310 	va = tlbi_asid(asid) | tlbi_addr(va);
10311 
10312 	if (pmap->type == PMAP_TYPE_NESTED) {
10313 		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
10314 	} else {
10315 		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
10316 	}
10317 
10318 #endif
10319 }
10320 
10321 MARK_AS_PMAP_TEXT static void
10322 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10323 {
10324 #if (__ARM_VMSA__ == 7)
10325 	flush_mmu_tlb_asid_async(pmap->hw_asid);
10326 #else /* (__ARM_VMSA__ == 7) */
10327 	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
10328 #endif /* (__ARM_VMSA__ == 7) */
10329 }
10330 
10331 void
10332 flush_mmu_tlb_region(
10333 	vm_offset_t va,
10334 	unsigned length)
10335 {
10336 	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
10337 	sync_tlb_flush();
10338 }
10339 
10340 unsigned int
10341 pmap_cache_attributes(
10342 	ppnum_t pn)
10343 {
10344 	pmap_paddr_t    paddr;
10345 	unsigned int    pai;
10346 	unsigned int    result;
10347 	pp_attr_t       pp_attr_current;
10348 
10349 	paddr = ptoa(pn);
10350 
10351 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10352 
10353 	if (!pa_valid(paddr)) {
10354 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10355 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10356 	}
10357 
10358 	result = VM_WIMG_DEFAULT;
10359 
10360 	pai = pa_index(paddr);
10361 
10362 	pp_attr_current = pp_attr_table[pai];
10363 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10364 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10365 	}
10366 	return result;
10367 }
10368 
10369 MARK_AS_PMAP_TEXT static void
10370 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10371 {
10372 	if ((wimg_bits_prev != wimg_bits_new)
10373 	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10374 	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10375 	    && (wimg_bits_new != VM_WIMG_COPYBACK))
10376 	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
10377 	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10378 		pmap_sync_page_attributes_phys(pn);
10379 	}
10380 
10381 	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10382 		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10383 	}
10384 }
10385 
10386 MARK_AS_PMAP_TEXT __unused void
10387 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10388 {
10389 	pmap_paddr_t paddr = ptoa(pn);
10390 	const unsigned int pai = pa_index(paddr);
10391 
10392 	if (__improbable(!pa_valid(paddr))) {
10393 		panic("%s called on non-managed page 0x%08x", __func__, pn);
10394 	}
10395 
10396 	pvh_lock(pai);
10397 
10398 #if XNU_MONITOR
10399 	if (__improbable(ppattr_pa_test_monitor(paddr))) {
10400 		panic("%s invoked on PPL page 0x%08x", __func__, pn);
10401 	}
10402 #endif
10403 
10404 	pmap_update_cache_attributes_locked(pn, new_cacheattr);
10405 
10406 	pvh_unlock(pai);
10407 
10408 	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10409 }
10410 
10411 void *
10412 pmap_map_compressor_page(ppnum_t pn)
10413 {
10414 #if __ARM_PTE_PHYSMAP__
10415 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10416 	if (cacheattr != VM_WIMG_DEFAULT) {
10417 #if XNU_MONITOR
10418 		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10419 #else
10420 		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10421 #endif
10422 	}
10423 #endif
10424 	return (void*)phystokv(ptoa(pn));
10425 }
10426 
10427 void
10428 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10429 {
10430 #if __ARM_PTE_PHYSMAP__
10431 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10432 	if (cacheattr != VM_WIMG_DEFAULT) {
10433 #if XNU_MONITOR
10434 		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10435 #else
10436 		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10437 #endif
10438 	}
10439 #endif
10440 }
10441 
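/*
 * Expected pairing for the two helpers above (caller shown as a sketch; the
 * surrounding compressor code is an assumption, not part of this file):
 *
 *	void *kva = pmap_map_compressor_page(pn);   // may temporarily force VM_WIMG_DEFAULT
 *	// ... compress or decompress through 'kva' ...
 *	pmap_unmap_compressor_page(pn, kva);        // restores the page's original WIMG attributes
 */
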
10442 MARK_AS_PMAP_TEXT boolean_t
10443 pmap_batch_set_cache_attributes_internal(
10444 	ppnum_t pn,
10445 	unsigned int cacheattr,
10446 	unsigned int page_cnt,
10447 	unsigned int page_index,
10448 	boolean_t doit,
10449 	unsigned int *res)
10450 {
10451 	pmap_paddr_t    paddr;
10452 	unsigned int    pai;
10453 	pp_attr_t       pp_attr_current;
10454 	pp_attr_t       pp_attr_template;
10455 	unsigned int    wimg_bits_prev, wimg_bits_new;
10456 
10457 	if (cacheattr & VM_WIMG_USE_DEFAULT) {
10458 		cacheattr = VM_WIMG_DEFAULT;
10459 	}
10460 
10461 	if ((doit == FALSE) && (*res == 0)) {
10462 		pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10463 		*res = page_cnt;
10464 		pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10465 		if (platform_cache_batch_wimg(cacheattr & (VM_WIMG_MASK), page_cnt << PAGE_SHIFT) == FALSE) {
10466 			return FALSE;
10467 		}
10468 	}
10469 
10470 	paddr = ptoa(pn);
10471 
10472 	if (!pa_valid(paddr)) {
10473 		panic("%s: pn 0x%08x not managed", __func__, pn);
10474 	}
10475 
10476 	pai = pa_index(paddr);
10477 
10478 	if (doit) {
10479 		pvh_lock(pai);
10480 #if XNU_MONITOR
10481 		if (ppattr_pa_test_monitor(paddr)) {
10482 			panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10483 		}
10484 #endif
10485 	}
10486 
10487 	do {
10488 		pp_attr_current = pp_attr_table[pai];
10489 		wimg_bits_prev = VM_WIMG_DEFAULT;
10490 		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10491 			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10492 		}
10493 
10494 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10495 
10496 		if (!doit) {
10497 			break;
10498 		}
10499 
10500 		/* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10501 		 * to avoid losing simultaneous updates to other bits like refmod. */
10502 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10503 
10504 	wimg_bits_new = VM_WIMG_DEFAULT;
10505 	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10506 		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10507 	}
10508 
10509 	if (doit) {
10510 		if (wimg_bits_new != wimg_bits_prev) {
10511 			pmap_update_cache_attributes_locked(pn, cacheattr);
10512 		}
10513 		pvh_unlock(pai);
10514 		if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10515 			pmap_force_dcache_clean(phystokv(paddr), PAGE_SIZE);
10516 		}
10517 	} else {
10518 		if (wimg_bits_new == VM_WIMG_COPYBACK) {
10519 			return FALSE;
10520 		}
10521 		if (wimg_bits_prev == wimg_bits_new) {
10522 			pmap_pin_kernel_pages((vm_offset_t)res, sizeof(*res));
10523 			*res = *res - 1;
10524 			pmap_unpin_kernel_pages((vm_offset_t)res, sizeof(*res));
10525 			if (!platform_cache_batch_wimg(wimg_bits_new, (*res) << PAGE_SHIFT)) {
10526 				return FALSE;
10527 			}
10528 		}
10529 		return TRUE;
10530 	}
10531 
10532 	if (page_cnt == (page_index + 1)) {
10533 		wimg_bits_prev = VM_WIMG_COPYBACK;
10534 		if (((wimg_bits_prev != wimg_bits_new))
10535 		    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10536 		    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10537 		    && (wimg_bits_new != VM_WIMG_COPYBACK))
10538 		    || ((wimg_bits_prev == VM_WIMG_WTHRU)
10539 		    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10540 			platform_cache_flush_wimg(wimg_bits_new);
10541 		}
10542 	}
10543 
10544 	return TRUE;
10545 }
10546 
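/*
 * The pp_attr_table update above uses a small lock-free pattern: re-read the
 * current value, derive the new value, and retry the 16-bit CAS until no
 * concurrent writer (e.g. one updating refmod bits) has raced us.  The same
 * pattern in isolation, as a sketch:
 *
 *	pp_attr_t cur, tmpl;
 *	do {
 *		cur = pp_attr_table[pai];
 *		tmpl = (cur & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & VM_WIMG_MASK);
 *	} while (!OSCompareAndSwap16(cur, tmpl, &pp_attr_table[pai]));
 */
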
10547 boolean_t
10548 pmap_batch_set_cache_attributes(
10549 	ppnum_t pn,
10550 	unsigned int cacheattr,
10551 	unsigned int page_cnt,
10552 	unsigned int page_index,
10553 	boolean_t doit,
10554 	unsigned int *res)
10555 {
10556 #if XNU_MONITOR
10557 	return pmap_batch_set_cache_attributes_ppl(pn, cacheattr, page_cnt, page_index, doit, res);
10558 #else
10559 	return pmap_batch_set_cache_attributes_internal(pn, cacheattr, page_cnt, page_index, doit, res);
10560 #endif
10561 }
10562 
10563 MARK_AS_PMAP_TEXT static void
10564 pmap_set_cache_attributes_priv(
10565 	ppnum_t pn,
10566 	unsigned int cacheattr,
10567 	boolean_t external __unused)
10568 {
10569 	pmap_paddr_t    paddr;
10570 	unsigned int    pai;
10571 	pp_attr_t       pp_attr_current;
10572 	pp_attr_t       pp_attr_template;
10573 	unsigned int    wimg_bits_prev, wimg_bits_new;
10574 
10575 	paddr = ptoa(pn);
10576 
10577 	if (!pa_valid(paddr)) {
10578 		return;                         /* Not a managed page. */
10579 	}
10580 
10581 	if (cacheattr & VM_WIMG_USE_DEFAULT) {
10582 		cacheattr = VM_WIMG_DEFAULT;
10583 	}
10584 
10585 	pai = pa_index(paddr);
10586 
10587 	pvh_lock(pai);
10588 
10589 #if XNU_MONITOR
10590 	if (external && ppattr_pa_test_monitor(paddr)) {
10591 		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10592 	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
10593 		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
10594 	}
10595 #endif
10596 
10597 	do {
10598 		pp_attr_current = pp_attr_table[pai];
10599 		wimg_bits_prev = VM_WIMG_DEFAULT;
10600 		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10601 			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10602 		}
10603 
10604 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10605 
10606 		/* WIMG bits should only be updated under the PVH lock, but we should do this in a CAS loop
10607 		 * to avoid losing simultaneous updates to other bits like refmod. */
10608 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10609 
10610 	wimg_bits_new = VM_WIMG_DEFAULT;
10611 	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10612 		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10613 	}
10614 
10615 	if (wimg_bits_new != wimg_bits_prev) {
10616 		pmap_update_cache_attributes_locked(pn, cacheattr);
10617 	}
10618 
10619 	pvh_unlock(pai);
10620 
10621 	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
10622 }
10623 
10624 MARK_AS_PMAP_TEXT void
10625 pmap_set_cache_attributes_internal(
10626 	ppnum_t pn,
10627 	unsigned int cacheattr)
10628 {
10629 	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10630 }
10631 
10632 void
10633 pmap_set_cache_attributes(
10634 	ppnum_t pn,
10635 	unsigned int cacheattr)
10636 {
10637 #if XNU_MONITOR
10638 	pmap_set_cache_attributes_ppl(pn, cacheattr);
10639 #else
10640 	pmap_set_cache_attributes_internal(pn, cacheattr);
10641 #endif
10642 }
10643 
10644 MARK_AS_PMAP_TEXT void
10645 pmap_update_cache_attributes_locked(
10646 	ppnum_t ppnum,
10647 	unsigned attributes)
10648 {
10649 	pmap_paddr_t    phys = ptoa(ppnum);
10650 	pv_entry_t      *pve_p;
10651 	pt_entry_t      *pte_p;
10652 	pv_entry_t      **pv_h;
10653 	pt_entry_t      tmplate;
10654 	unsigned int    pai;
10655 	boolean_t       tlb_flush_needed = FALSE;
10656 
10657 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10658 
10659 	if (pmap_panic_dev_wimg_on_managed) {
10660 		switch (attributes & VM_WIMG_MASK) {
10661 		case VM_WIMG_IO:                        // nGnRnE
10662 		case VM_WIMG_POSTED:                    // nGnRE
10663 		/* supported on DRAM, but slow, so we disallow */
10664 
10665 		case VM_WIMG_POSTED_REORDERED:          // nGRE
10666 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10667 			/* unsupported on DRAM */
10668 
10669 			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10670 			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10671 			break;
10672 
10673 		default:
10674 			/* not device type memory, all good */
10675 
10676 			break;
10677 		}
10678 	}
10679 
10680 #if __ARM_PTE_PHYSMAP__
10681 	vm_offset_t kva = phystokv(phys);
10682 	pte_p = pmap_pte(kernel_pmap, kva);
10683 
10684 	tmplate = *pte_p;
10685 	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10686 #if XNU_MONITOR
10687 	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10688 #else
10689 	tmplate |= wimg_to_pte(attributes, phys);
10690 #endif
10691 #if (__ARM_VMSA__ > 7)
10692 	if (tmplate & ARM_PTE_HINT_MASK) {
10693 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10694 		    __FUNCTION__, pte_p, (void *)kva, tmplate);
10695 	}
10696 #endif
10697 	write_pte_strong(pte_p, tmplate);
10698 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10699 	tlb_flush_needed = TRUE;
10700 #endif
10701 
10702 	pai = pa_index(phys);
10703 
10704 	pv_h = pai_to_pvh(pai);
10705 
10706 	pte_p = PT_ENTRY_NULL;
10707 	pve_p = PV_ENTRY_NULL;
10708 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10709 		pte_p = pvh_ptep(pv_h);
10710 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10711 		pve_p = pvh_pve_list(pv_h);
10712 		pte_p = PT_ENTRY_NULL;
10713 	}
10714 
10715 	int pve_ptep_idx = 0;
10716 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10717 		vm_map_address_t va;
10718 		pmap_t          pmap;
10719 
10720 		if (pve_p != PV_ENTRY_NULL) {
10721 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10722 			if (pte_p == PT_ENTRY_NULL) {
10723 				goto cache_skip_pve;
10724 			}
10725 		}
10726 
10727 #ifdef PVH_FLAG_IOMMU
10728 		if (pvh_ptep_is_iommu(pte_p)) {
10729 			goto cache_skip_pve;
10730 		}
10731 #endif
10732 		pmap = ptep_get_pmap(pte_p);
10733 		va = ptep_get_va(pte_p);
10734 
10735 		tmplate = *pte_p;
10736 		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10737 		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
10738 
10739 		write_pte_strong(pte_p, tmplate);
10740 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10741 		tlb_flush_needed = TRUE;
10742 
10743 cache_skip_pve:
10744 		pte_p = PT_ENTRY_NULL;
10745 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10746 			pve_ptep_idx = 0;
10747 			pve_p = pve_next(pve_p);
10748 		}
10749 	}
10750 	if (tlb_flush_needed) {
10751 		pmap_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
10752 	}
10753 
10754 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
10755 }
10756 
10757 #if (__ARM_VMSA__ == 7)
10758 void
10759 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10760     vm_map_address_t *user_commpage_addr)
10761 {
10762 	pmap_paddr_t    pa;
10763 	kern_return_t   kr;
10764 
10765 	assert(kernel_data_addr != NULL);
10766 	assert(kernel_text_addr != NULL);
10767 	assert(user_commpage_addr != NULL);
10768 
10769 	(void) pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, 0);
10770 
10771 	kr = pmap_enter(kernel_pmap, _COMM_PAGE_BASE_ADDRESS, atop(pa), VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10772 	assert(kr == KERN_SUCCESS);
10773 
10774 	*kernel_data_addr = phystokv(pa);
10775 	// There is no PFZ on 32-bit ARM, so the kernel text and user commpage addresses are always 0
10776 	*kernel_text_addr = 0;
10777 	*user_commpage_addr = 0;
10778 }
10779 
10780 #else /* __ARM_VMSA__ == 7 */
10781 
10782 /**
10783  * Mark a pmap as being dedicated to use for a commpage mapping.
10784  * The pmap itself will never be activated on a CPU; its mappings will
10785  * only be embedded in userspace pmaps at a fixed virtual address.
10786  *
10787  * @param pmap the pmap to mark as belonging to a commpage.
10788  */
10789 static void
10790 pmap_set_commpage(pmap_t pmap)
10791 {
10792 #if XNU_MONITOR
10793 	assert(!pmap_ppl_locked_down);
10794 #endif
10795 	assert(pmap->type == PMAP_TYPE_USER);
10796 	pmap->type = PMAP_TYPE_COMMPAGE;
10797 	/*
10798 	 * Free the pmap's ASID.  This pmap should not ever be directly
10799 	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
10800 	 * ASID space contention but will also cause pmap_switch() to panic
10801 	 * if an attacker tries to activate this pmap.  Disable preemption to
10802 	 * accommodate the *_nopreempt spinlock in free_asid().
10803 	 */
10804 	mp_disable_preemption();
10805 	pmap_get_pt_ops(pmap)->free_id(pmap);
10806 	mp_enable_preemption();
10807 }
10808 
10809 static void
10810 pmap_update_tt3e(
10811 	pmap_t pmap,
10812 	vm_address_t address,
10813 	tt_entry_t template)
10814 {
10815 	tt_entry_t *ptep, pte;
10816 
10817 	ptep = pmap_tt3e(pmap, address);
10818 	if (ptep == NULL) {
10819 		panic("%s: no ptep?", __FUNCTION__);
10820 	}
10821 
10822 	pte = *ptep;
10823 	pte = tte_to_pa(pte) | template;
10824 	write_pte_strong(ptep, pte);
10825 }
10826 
10827 /* Note absence of non-global bit */
10828 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10829 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10830 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
10831 	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10832 
10833 /* Note absence of non-global bit and no-execute bit.  */
10834 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
10835 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
10836 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
10837 	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10838 
10839 void
10840 pmap_create_sharedpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10841     vm_map_address_t *user_text_addr)
10842 {
10843 	kern_return_t kr;
10844 	pmap_paddr_t data_pa = 0; // data address
10845 	pmap_paddr_t text_pa = 0; // text address
10846 
10847 	*kernel_data_addr = 0;
10848 	*kernel_text_addr = 0;
10849 	*user_text_addr = 0;
10850 
10851 #if XNU_MONITOR
10852 	data_pa = pmap_alloc_page_for_kern(0);
10853 	assert(data_pa);
10854 	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
10855 #if CONFIG_ARM_PFZ
10856 	text_pa = pmap_alloc_page_for_kern(0);
10857 	assert(text_pa);
10858 	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
10859 #endif
10860 
10861 #else /* XNU_MONITOR */
10862 	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
10863 #if CONFIG_ARM_PFZ
10864 	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
10865 #endif
10866 
10867 #endif /* XNU_MONITOR */
10868 
10869 	/*
10870 	 * In order to avoid burning extra pages on mapping the shared page, we
10871 	 * create a dedicated pmap for the shared page.  We forcibly nest the
10872 	 * translation tables from this pmap into other pmaps.  The level we
10873 	 * will nest at depends on the MMU configuration (page size, TTBR range,
10874 	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
10875 	 *
10876 	 * Note that this is NOT "the nested pmap" (which is used to nest the
10877 	 * shared cache).
10878 	 *
10879 	 * Note that we update parameters of the entry for our unique needs (NG
10880 	 * entry, etc.).
10881 	 */
10882 	sharedpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
10883 	assert(sharedpage_pmap_default != NULL);
10884 	pmap_set_commpage(sharedpage_pmap_default);
10885 
10886 	/* The user 64-bit mapping... */
10887 	kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10888 	assert(kr == KERN_SUCCESS);
10889 	pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10890 #if CONFIG_ARM_PFZ
10891 	/* User mapping of comm page text section for 64 bit mapping only
10892 	 *
10893 	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10894 	 * user processes to get this page mapped in; they should never call into
10895 	 * this page.
10896 	 *
10897 	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10898 	 * is slid within the same L3 page table as the data commpage.  It is either outside the
10899 	 * maximum user VA or is pre-reserved in vm_map_exec().  This means that
10900 	 * it is reserved and unavailable to mach VM for future mappings.
10901 	 */
10902 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(sharedpage_pmap_default);
10903 	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
10904 
10905 	vm_map_address_t commpage_text_va = 0;
10906 
10907 	do {
10908 		int text_leaf_index = random() % num_ptes;
10909 
10910 		// Generate a VA for the commpage text with the same root and twig index as data
10911 		// comm page, but with new leaf index we've just generated.
10912 		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
10913 		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
10914 	} while (commpage_text_va == _COMM_PAGE64_BASE_ADDRESS); // Try again if we collide (should be unlikely)
10915 
10916 	// Assert that this is empty
10917 	__assert_only pt_entry_t *ptep = pmap_pte(sharedpage_pmap_default, commpage_text_va);
10918 	assert(ptep != PT_ENTRY_NULL);
10919 	assert(*ptep == ARM_TTE_EMPTY);
10920 
10921 	// At this point, we've found the address we want to insert our comm page at
10922 	kr = pmap_enter_addr(sharedpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10923 	assert(kr == KERN_SUCCESS);
10924 	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
10925 	pmap_update_tt3e(sharedpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
10926 
10927 	*user_text_addr = commpage_text_va;
10928 #endif
10929 
10930 	/* ...and the user 32-bit mapping. */
10931 	kr = pmap_enter_addr(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10932 	assert(kr == KERN_SUCCESS);
10933 	pmap_update_tt3e(sharedpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10934 
10935 #if __ARM_MIXED_PAGE_SIZE__
10936 	/**
10937 	 * To handle 4K tasks, a separate view (pmap) of the shared page is needed: a
10938 	 * new set of page tables that point to the exact same 16K shared page as
10939 	 * before. Only the first 4K of the 16K shared page is mapped, since that's
10940 	 * the only part that contains relevant data.
10941 	 */
10942 	sharedpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
10943 	assert(sharedpage_pmap_4k != NULL);
10944 	pmap_set_commpage(sharedpage_pmap_4k);
10945 
10946 	/* The user 64-bit mapping... */
10947 	kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10948 	assert(kr == KERN_SUCCESS);
10949 	pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10950 
10951 	/* ...and the user 32-bit mapping. */
10952 	kr = pmap_enter_addr(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10953 	assert(kr == KERN_SUCCESS);
10954 	pmap_update_tt3e(sharedpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10955 
10956 #endif
10957 
10958 	/* For manipulation in kernel, go straight to physical page */
10959 	*kernel_data_addr = phystokv(data_pa);
10960 	*kernel_text_addr = (text_pa) ? phystokv(text_pa) : 0;
10961 }
10962 
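/*
 * The text commpage VA chosen above keeps the data commpage's root and twig table
 * indices and randomizes only the leaf index.  A worked sketch of the arithmetic
 * (assuming 16K pages, where a leaf table holds 2048 entries):
 *
 *	va  = _COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr); // clear leaf-index bits
 *	va |= ((uint64_t)text_leaf_index << pt_attr_leaf_shift(pt_attr));    // splice in the random slot
 */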
10963 
10964 /*
10965  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10966  * with user controlled TTEs for regions that aren't explicitly reserved by the
10967  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10968  */
10969 #if (ARM_PGSHIFT == 14)
10970 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10971 #elif (ARM_PGSHIFT == 12)
10972 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= VM_MAX_ADDRESS);
10973 #else
10974 #error Nested shared page mapping is unsupported on this config
10975 #endif
10976 
10977 MARK_AS_PMAP_TEXT kern_return_t
10978 pmap_insert_sharedpage_internal(
10979 	pmap_t pmap)
10980 {
10981 	kern_return_t kr = KERN_SUCCESS;
10982 	vm_offset_t sharedpage_vaddr;
10983 	pt_entry_t *ttep, *src_ttep;
10984 	int options = 0;
10985 	pmap_t sharedpage_pmap = sharedpage_pmap_default;
10986 
10987 	/* Validate the pmap input before accessing its data. */
10988 	validate_pmap_mutable(pmap);
10989 
10990 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10991 	const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
10992 
10993 #if __ARM_MIXED_PAGE_SIZE__
10994 #if !__ARM_16K_PG__
10995 	/* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
10996 	#error "pmap_insert_sharedpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10997 #endif /* !__ARM_16K_PG__ */
10998 
10999 	/* Choose the correct shared page pmap to use. */
11000 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11001 	if (pmap_page_size == 16384) {
11002 		sharedpage_pmap = sharedpage_pmap_default;
11003 	} else if (pmap_page_size == 4096) {
11004 		sharedpage_pmap = sharedpage_pmap_4k;
11005 	} else {
11006 		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11007 	}
11008 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11009 
11010 #if XNU_MONITOR
11011 	options |= PMAP_OPTIONS_NOWAIT;
11012 #endif /* XNU_MONITOR */
11013 
11014 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11015 #error We assume a single page.
11016 #endif
11017 
11018 	if (pmap_is_64bit(pmap)) {
11019 		sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11020 	} else {
11021 		sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11022 	}
11023 
11024 
11025 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11026 
11027 	/*
11028 	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
11029 	 * two (2MB) depending on the address space layout. For 16KB pages, each level
11030 	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
11031 	 * to "nest".
11032 	 *
11033 	 * Note: This is not "nesting" in the shared cache sense. This definition of
11034 	 * nesting just means inserting pointers to pre-allocated tables inside of
11035 	 * the passed in pmap to allow us to share page tables (which map the shared
11036 	 * page) for every task. This saves at least one page of memory per process
11037 	 * compared to creating new page tables in every process for mapping the
11038 	 * shared page.
11039 	 */
11040 
11041 	/**
11042 	 * Allocate the twig page tables if needed, and slam a pointer to the shared
11043 	 * page's tables into place.
11044 	 */
11045 	while ((ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr)) == TT_ENTRY_NULL) {
11046 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11047 
11048 		kr = pmap_expand(pmap, sharedpage_vaddr, options, sharedpage_level);
11049 
11050 		if (kr != KERN_SUCCESS) {
11051 #if XNU_MONITOR
11052 			if (kr == KERN_RESOURCE_SHORTAGE) {
11053 				return kr;
11054 			} else
11055 #endif
11056 			{
11057 				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
11058 			}
11059 		}
11060 
11061 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11062 	}
11063 
11064 	if (*ttep != ARM_PTE_EMPTY) {
11065 		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
11066 	}
11067 
11068 	src_ttep = pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr);
11069 
11070 	*ttep = *src_ttep;
11071 	FLUSH_PTE_STRONG();
11072 
11073 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11074 
11075 	return kr;
11076 }
11077 
11078 static void
11079 pmap_unmap_sharedpage(
11080 	pmap_t pmap)
11081 {
11082 	pt_entry_t *ttep;
11083 	vm_offset_t sharedpage_vaddr;
11084 	pmap_t sharedpage_pmap = sharedpage_pmap_default;
11085 
11086 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11087 	const unsigned int sharedpage_level = pt_attr_commpage_level(pt_attr);
11088 
11089 #if __ARM_MIXED_PAGE_SIZE__
11090 #if !__ARM_16K_PG__
11091 	/* The following code assumes that sharedpage_pmap_default is a 16KB pmap. */
11092 	#error "pmap_unmap_sharedpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11093 #endif /* !__ARM_16K_PG__ */
11094 
11095 	/* Choose the correct shared page pmap to use. */
11096 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11097 	if (pmap_page_size == 16384) {
11098 		sharedpage_pmap = sharedpage_pmap_default;
11099 	} else if (pmap_page_size == 4096) {
11100 		sharedpage_pmap = sharedpage_pmap_4k;
11101 	} else {
11102 		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11103 	}
11104 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11105 
11106 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11107 #error We assume a single page.
11108 #endif
11109 
11110 	if (pmap_is_64bit(pmap)) {
11111 		sharedpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11112 	} else {
11113 		sharedpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11114 	}
11115 
11116 
11117 	ttep = pmap_ttne(pmap, sharedpage_level, sharedpage_vaddr);
11118 
11119 	if (ttep == NULL) {
11120 		return;
11121 	}
11122 
11123 	/* It had better be mapped to the shared page. */
11124 	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(sharedpage_pmap, sharedpage_level, sharedpage_vaddr)) {
11125 		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
11126 	}
11127 
11128 	*ttep = ARM_TTE_EMPTY;
11129 	FLUSH_PTE_STRONG();
11130 
11131 	flush_mmu_tlb_region_asid_async(sharedpage_vaddr, PAGE_SIZE, pmap, false);
11132 	sync_tlb_flush();
11133 }
11134 
11135 void
11136 pmap_insert_sharedpage(
11137 	pmap_t pmap)
11138 {
11139 #if XNU_MONITOR
11140 	kern_return_t kr = KERN_FAILURE;
11141 
11142 	while ((kr = pmap_insert_sharedpage_ppl(pmap)) == KERN_RESOURCE_SHORTAGE) {
11143 		pmap_alloc_page_for_ppl(0);
11144 	}
11145 
11146 	pmap_ledger_check_balance(pmap);
11147 
11148 	if (kr != KERN_SUCCESS) {
11149 		panic("%s: failed to insert the shared page, kr=%d, "
11150 		    "pmap=%p",
11151 		    __FUNCTION__, kr,
11152 		    pmap);
11153 	}
11154 #else
11155 	pmap_insert_sharedpage_internal(pmap);
11156 #endif
11157 }
11158 
11159 static boolean_t
11160 pmap_is_64bit(
11161 	pmap_t pmap)
11162 {
11163 	return pmap->is_64bit;
11164 }
11165 
11166 bool
11167 pmap_is_exotic(
11168 	pmap_t pmap __unused)
11169 {
11170 	return false;
11171 }
11172 
11173 #endif
11174 
11175 /* ARMTODO: provide an implementation that accounts for
11176  * holes in the physical map, if any.
11177  */
11178 boolean_t
11179 pmap_valid_page(
11180 	ppnum_t pn)
11181 {
11182 	return pa_valid(ptoa(pn));
11183 }
11184 
11185 boolean_t
11186 pmap_bootloader_page(
11187 	ppnum_t pn)
11188 {
11189 	pmap_paddr_t paddr = ptoa(pn);
11190 
11191 	if (pa_valid(paddr)) {
11192 		return FALSE;
11193 	}
11194 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11195 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11196 }
11197 
11198 MARK_AS_PMAP_TEXT boolean_t
11199 pmap_is_empty_internal(
11200 	pmap_t pmap,
11201 	vm_map_offset_t va_start,
11202 	vm_map_offset_t va_end)
11203 {
11204 	vm_map_offset_t block_start, block_end;
11205 	tt_entry_t *tte_p;
11206 
11207 	if (pmap == NULL) {
11208 		return TRUE;
11209 	}
11210 
11211 	validate_pmap(pmap);
11212 
11213 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11214 	unsigned int initial_not_in_kdp = not_in_kdp;
11215 
11216 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11217 		pmap_lock(pmap, PMAP_LOCK_SHARED);
11218 	}
11219 
11220 #if     (__ARM_VMSA__ == 7)
11221 	if (tte_index(pt_attr, va_end) >= pmap->tte_index_max) {
11222 		if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11223 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
11224 		}
11225 		return TRUE;
11226 	}
11227 #endif
11228 
11229 	/* TODO: This will be faster if we increment ttep at each level. */
11230 	block_start = va_start;
11231 
11232 	while (block_start < va_end) {
11233 		pt_entry_t     *bpte_p, *epte_p;
11234 		pt_entry_t     *pte_p;
11235 
11236 		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11237 		if (block_end > va_end) {
11238 			block_end = va_end;
11239 		}
11240 
11241 		tte_p = pmap_tte(pmap, block_start);
11242 		if ((tte_p != PT_ENTRY_NULL)
11243 		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
11244 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
11245 			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11246 			epte_p = &pte_p[pte_index(pt_attr, block_end)];
11247 
11248 			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11249 				if (*pte_p != ARM_PTE_EMPTY) {
11250 					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11251 						pmap_unlock(pmap, PMAP_LOCK_SHARED);
11252 					}
11253 					return FALSE;
11254 				}
11255 			}
11256 		}
11257 		block_start = block_end;
11258 	}
11259 
11260 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11261 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
11262 	}
11263 
11264 	return TRUE;
11265 }
11266 
11267 boolean_t
11268 pmap_is_empty(
11269 	pmap_t pmap,
11270 	vm_map_offset_t va_start,
11271 	vm_map_offset_t va_end)
11272 {
11273 #if XNU_MONITOR
11274 	return pmap_is_empty_ppl(pmap, va_start, va_end);
11275 #else
11276 	return pmap_is_empty_internal(pmap, va_start, va_end);
11277 #endif
11278 }
11279 
11280 vm_map_offset_t
11281 pmap_max_offset(
11282 	boolean_t               is64,
11283 	unsigned int    option)
11284 {
11285 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11286 }
11287 
11288 vm_map_offset_t
11289 pmap_max_64bit_offset(
11290 	__unused unsigned int option)
11291 {
11292 	vm_map_offset_t max_offset_ret = 0;
11293 
11294 #if defined(__arm64__)
11295 	#define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000) // end of shared region + 512MB for various purposes
11296 	_Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
11297 	    "Minimum address space size outside allowable range");
11298 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11299 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11300 		max_offset_ret = arm64_pmap_max_offset_default;
11301 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11302 		max_offset_ret = min_max_offset;
11303 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11304 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11305 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11306 		if (arm64_pmap_max_offset_default) {
11307 			max_offset_ret = arm64_pmap_max_offset_default;
11308 		} else if (max_mem > 0xC0000000) {
11309 			max_offset_ret = min_max_offset + 0x138000000; // Max offset is 13.375GB for devices with > 3GB of memory
11310 		} else if (max_mem > 0x40000000) {
11311 			max_offset_ret = min_max_offset + 0x38000000;  // Max offset is 9.375GB for devices with > 1GB and <= 3GB of memory
11312 		} else {
11313 			max_offset_ret = min_max_offset;
11314 		}
11315 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11316 		if (arm64_pmap_max_offset_default) {
11317 			// Allow the boot-arg to override jumbo size
11318 			max_offset_ret = arm64_pmap_max_offset_default;
11319 		} else {
11320 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11321 		}
11322 	} else {
11323 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11324 	}
11325 
11326 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11327 	assert(max_offset_ret >= min_max_offset);
11328 #else
11329 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11330 #endif
11331 
11332 	return max_offset_ret;
11333 }
11334 
11335 vm_map_offset_t
11336 pmap_max_32bit_offset(
11337 	unsigned int option)
11338 {
11339 	vm_map_offset_t max_offset_ret = 0;
11340 
11341 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11342 		max_offset_ret = arm_pmap_max_offset_default;
11343 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11344 		max_offset_ret = 0x80000000;
11345 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11346 		max_offset_ret = VM_MAX_ADDRESS;
11347 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11348 		if (arm_pmap_max_offset_default) {
11349 			max_offset_ret = arm_pmap_max_offset_default;
11350 		} else if (max_mem > 0x20000000) {
11351 			max_offset_ret = 0x80000000;
11352 		} else {
11353 			max_offset_ret = 0x80000000;
11354 		}
11355 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11356 		max_offset_ret = 0x80000000;
11357 	} else {
11358 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11359 	}
11360 
11361 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11362 	return max_offset_ret;
11363 }
11364 
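/*
 * Usage sketch: callers normally go through pmap_max_offset(), selecting a policy
 * with one of the ARM_PMAP_MAX_OFFSET_* options (variable names are illustrative):
 *
 *	vm_map_offset_t max_va;
 *	max_va = pmap_max_offset(TRUE,  ARM_PMAP_MAX_OFFSET_DEVICE);  // 64-bit, device-sized default
 *	max_va = pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN);     // 32-bit, minimum (2GB)
 */
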
11365 #if CONFIG_DTRACE
11366 /*
11367  * Constrain DTrace copyin/copyout actions
11368  */
11369 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11370 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11371 
11372 kern_return_t
11373 dtrace_copyio_preflight(
11374 	__unused addr64_t va)
11375 {
11376 	if (current_map() == kernel_map) {
11377 		return KERN_FAILURE;
11378 	} else {
11379 		return KERN_SUCCESS;
11380 	}
11381 }
11382 
11383 kern_return_t
11384 dtrace_copyio_postflight(
11385 	__unused addr64_t va)
11386 {
11387 	return KERN_SUCCESS;
11388 }
11389 #endif /* CONFIG_DTRACE */
11390 
11391 
11392 void
11393 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11394 {
11395 }
11396 
11397 
11398 void
11399 pmap_flush(
11400 	__unused pmap_flush_context *cpus_to_flush)
11401 {
11402 	/* not implemented yet */
11403 	return;
11404 }
11405 
11406 #if XNU_MONITOR
11407 
11408 /*
11409  * Enforce that the address range described by kva and nbytes is not currently
11410  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11411  * unintentionally writing to PPL-owned memory.
11412  */
11413 static void
11414 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11415 {
11416 	vm_offset_t end;
11417 	if (os_add_overflow(kva, nbytes, &end)) {
11418 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11419 	}
11420 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11421 		pmap_paddr_t pa = kvtophys_nofail(ckva);
11422 		pp_attr_t attr;
11423 		unsigned int pai = pa_index(pa);
11424 		if (ckva == phystokv(pa)) {
11425 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11426 		}
11427 		do {
11428 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11429 			if (attr & PP_ATTR_MONITOR) {
11430 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11431 			}
11432 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11433 	}
11434 }
11435 
11436 static void
11437 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
11438 {
11439 	vm_offset_t end;
11440 	if (os_add_overflow(kva, nbytes, &end)) {
11441 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11442 	}
11443 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11444 		pmap_paddr_t pa = kvtophys_nofail(ckva);
11445 
11446 		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
11447 			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
11448 		}
11449 		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
11450 		ppattr_pa_clear_no_monitor(pa);
11451 	}
11452 }
11453 
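/*
 * Typical pin/unpin pairing, as used by the nest/unnest paths earlier in this file
 * when writing a status code back through a kernel pointer (a sketch of that use):
 *
 *	pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));   // panics if the page is PPL-owned
 *	*krp = kr;                                                // safe: page cannot become PPL-owned
 *	pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp)); // drop the PP_ATTR_NO_MONITOR pin
 */
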
11454 /**
11455  * Lock down a page, making all mappings read-only, and preventing further
11456  * mappings or removal of this particular kva's mapping. Effectively, it makes
11457  * the physical page at kva immutable (see the ppl_writable parameter for an
11458  * exception to this).
11459  *
11460  * @param kva Valid address to any mapping of the physical page to lockdown.
11461  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11462  * @param ppl_writable True if the PPL should still be able to write to the page
11463  *                     using the physical aperture mapping. False will make the
11464  *                     page read-only for both the kernel and PPL in the
11465  *                     physical aperture.
11466  */
11467 MARK_AS_PMAP_TEXT static void
11468 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11469 {
11470 	const pmap_paddr_t pa = kvtophys_nofail(kva);
11471 	const unsigned int pai = pa_index(pa);
11472 
11473 	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11474 	pvh_lock(pai);
11475 	pv_entry_t **pvh = pai_to_pvh(pai);
11476 	const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11477 
11478 	if (__improbable(ppattr_pa_test_monitor(pa))) {
11479 		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11480 	}
11481 
11482 	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11483 		panic("%s: %#lx already locked down/executable (%#llx)",
11484 		    __func__, kva, (uint64_t)pvh_flags);
11485 	}
11486 
11487 	pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11488 
11489 	/* Update the physical aperture mapping to prevent kernel write access. */
11490 	const unsigned int new_xprr_perm =
11491 	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11492 	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11493 
11494 	pvh_unlock(pai);
11495 
11496 	pmap_page_protect_options_internal((ppnum_t)atop(pa), VM_PROT_READ, 0, NULL);
11497 
11498 	/**
11499 	 * Double-check that the mapping didn't change physical addresses before the
11500 	 * LOCKDOWN flag was set (there is a brief window between the above
11501 	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11502 	 *
11503 	 * This doesn't solve the ABA problem, but this doesn't have to since once
11504 	 * the pvh_lock() is grabbed no new mappings can be created on this physical
11505 	 * page without the LOCKDOWN flag already set (so any future mappings can
11506 	 * only be RO, and no existing mappings can be removed).
11507 	 */
11508 	if (kvtophys_nofail(kva) != pa) {
11509 		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11510 		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11511 	}
11512 }
11513 
11514 /**
11515  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11516  * kernel once again.
11517  *
11518  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11519  *       to unlockdown a page that was never locked down will panic.
11520  *
11521  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11522  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11523  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11524  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11525  *                     deviation will result in a panic.
11526  */
11527 MARK_AS_PMAP_TEXT static void
11528 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11529 {
11530 	pvh_assert_locked(pai);
11531 	pv_entry_t **pvh = pai_to_pvh(pai);
11532 	const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11533 
11534 	if (__improbable(!(pvh_flags & lockdown_flag))) {
11535 		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11536 		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11537 	}
11538 
11539 	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11540 
11541 	/* Restore the pre-lockdown physical aperture mapping permissions. */
11542 	const unsigned int old_xprr_perm =
11543 	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11544 	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11545 }
11546 
11547 /**
11548  * Release a page from being locked down to the PPL, making it writable to the
11549  * kernel once again.
11550  *
11551  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempt
11552  *       to unlockdown a page that was never locked down will panic.
11553  *
11554  * @param kva Valid address to any mapping of the physical page to unlockdown.
11555  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11556  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11557  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11558  *                     deviation will result in a panic.
11559  */
11560 MARK_AS_PMAP_TEXT static void
11561 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11562 {
11563 	const pmap_paddr_t pa = kvtophys_nofail(kva);
11564 	const unsigned int pai = pa_index(pa);
11565 
11566 	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11567 	pvh_lock(pai);
11568 	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11569 	pvh_unlock(pai);
11570 }
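/*
 * Illustrative pairing sketch (mirroring how pmap_phys_write_disable() below
 * uses these helpers): a page mapped at `kva` can be locked down for the
 * read-only allocator and later released with matching arguments.
 *
 *   pmap_ppl_lockdown_page(kva, PVH_FLAG_LOCKDOWN_RO, true);
 *   ... page is now read-only to the kernel (still PPL-writable via the
 *       physical aperture because ppl_writable == true) ...
 *   pmap_ppl_unlockdown_page(kva, PVH_FLAG_LOCKDOWN_RO, true);
 *
 * The lockdown_flag and ppl_writable arguments must match across the pair;
 * a missing flag panics in the unlockdown path, and a mismatched
 * ppl_writable would restore the wrong physical-aperture permission.
 */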
11571 
11572 #else /* XNU_MONITOR */
11573 
11574 static void __unused
11575 pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11576 {
11577 }
11578 
11579 static void __unused
11580 pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11581 {
11582 }
11583 
11584 #endif /* !XNU_MONITOR */
11585 
11586 
11587 MARK_AS_PMAP_TEXT static inline void
11588 pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11589 {
11590 #if XNU_MONITOR
11591 	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11592 #else
11593 	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
11594 #endif
11595 }
11596 
11597 MARK_AS_PMAP_TEXT static inline void
11598 pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11599 {
11600 #if XNU_MONITOR
11601 	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11602 #else
11603 	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
11604 #endif
11605 }
11606 
11607 /**
11608  * Perform basic validation checks on the destination only (and the
11609  * corresponding offset/size) prior to writing to a read-only allocation.
11610  *
11611  * @note Should be called before writing to an allocation from the read
11612  * only allocator.
11613  *
11614  * @param zid The ID of the zone the allocation belongs to.
11615  * @param va VA of element being modified (destination).
11616  * @param offset Offset being written to, in the element.
11617  * @param new_data_size Size of modification.
11618  *
11619  */
11620 
11621 MARK_AS_PMAP_TEXT static void
11622 pmap_ro_zone_validate_element_dst(
11623 	zone_id_t           zid,
11624 	vm_offset_t         va,
11625 	vm_offset_t         offset,
11626 	vm_size_t           new_data_size)
11627 {
11628 	vm_size_t elem_size = zone_elem_size_ro(zid);
11629 	vm_offset_t sum = 0, page = trunc_page(va);
11630 
11631 	if (__improbable(new_data_size > (elem_size - offset))) {
11632 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
11633 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11634 	}
11635 	if (__improbable(offset >= elem_size)) {
11636 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
11637 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11638 	}
11639 	if (__improbable(os_add3_overflow(va, offset, new_data_size, &sum))) {
11640 		panic("%s: Integer addition overflow %p + %lu + %lu = %lu",
11641 		    __func__, (void*)va, (uintptr_t)offset, (uintptr_t) new_data_size,
11642 		    (uintptr_t)sum);
11643 	}
11644 	if (__improbable((va - page) % elem_size)) {
11645 		panic("%s: Start of element %p is not aligned to element size %lu",
11646 		    __func__, (void *)va, (uintptr_t)elem_size);
11647 	}
11648 
11649 	/* Check element is from correct zone */
11650 	zone_require_ro(zid, elem_size, (void*)va);
11651 }
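/*
 * Worked example of the checks above (illustrative numbers): for a zone
 * whose elements are 128 bytes, a write described by offset = 120 and
 * new_data_size = 16 panics at the first check, since 16 > (128 - 120);
 * offset = 128 panics at the second check; and an element whose VA is not a
 * multiple of 128 bytes from the start of its page trips the alignment
 * check before zone_require_ro() is consulted.
 */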
11652 
11653 
11654 /**
11655  * Perform basic validation checks on the source, destination, and
11656  * corresponding offset/sizes prior to writing to a read-only allocation.
11657  *
11658  * @note Should be called before writing to an allocation from the read
11659  * only allocator.
11660  *
11661  * @param zid The ID of the zone the allocation belongs to.
11662  * @param va VA of element being modified (destination).
11663  * @param offset Offset being written to, in the element.
11664  * @param new_data Pointer to new data (source).
11665  * @param new_data_size Size of modification.
11666  *
11667  */
11668 
11669 MARK_AS_PMAP_TEXT static void
11670 pmap_ro_zone_validate_element(
11671 	zone_id_t           zid,
11672 	vm_offset_t         va,
11673 	vm_offset_t         offset,
11674 	const vm_offset_t   new_data,
11675 	vm_size_t           new_data_size)
11676 {
11677 	vm_offset_t sum = 0;
11678 
11679 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11680 		panic("%s: Integer addition overflow %p + %lu = %lu",
11681 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11682 	}
11683 
11684 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11685 }
11686 
11687 /**
11688  * Ensure that the physical page is locked down and pinned before writing to it.
11689  *
11690  * @note Should be called before writing to an allocation from the read
11691  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page();
11692  * ensure that it is called after the modification.
11693  *
11694  *
11695  * @param pa Physical address of the element being modified.
11696  * @param va Virtual address of element being modified.
11697  * @param size Size of the modification.
11698  *
11699  */
11700 
11701 MARK_AS_PMAP_TEXT static void
11702 pmap_ro_zone_lock_phy_page(
11703 	const pmap_paddr_t  pa,
11704 	vm_offset_t         va,
11705 	vm_size_t           size)
11706 {
11707 	const unsigned int pai = pa_index(pa);
11708 	pvh_lock(pai);
11709 
11710 	/* Ensure that the physical page is locked down */
11711 #if XNU_MONITOR
11712 	pv_entry_t **pvh = pai_to_pvh(pai);
11713 	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
11714 		panic("%s: Physical page not locked down %llx", __func__, pa);
11715 	}
11716 #endif /* XNU_MONITOR */
11717 
11718 	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
11719 	pmap_pin_kernel_pages(va, size);
11720 }
11721 
11722 /**
11723  * Unlock and unpin the physical page after writing to it.
11724  *
11725  * @note Should be called after writing to an allocation from the read
11726  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page();
11727  * ensure that it has been called prior to the modification.
11728  *
11729  * @param pa Physical address of the element that was modified.
11730  * @param va Virtual address of element that was modified.
11731  * @param size Size of the modification.
11732  *
11733  */
11734 
11735 MARK_AS_PMAP_TEXT static void
11736 pmap_ro_zone_unlock_phy_page(
11737 	const pmap_paddr_t  pa,
11738 	vm_offset_t         va,
11739 	vm_size_t           size)
11740 {
11741 	const unsigned int pai = pa_index(pa);
11742 	pmap_unpin_kernel_pages(va, size);
11743 	pvh_unlock(pai);
11744 }
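/*
 * Sketch of the intended bracketing around a physical-aperture write (this
 * is the pattern used by pmap_ro_zone_memcpy_internal() and
 * pmap_ro_zone_bzero_internal() below):
 *
 *   const pmap_paddr_t pa = kvtophys_nofail(va + offset);
 *   pmap_ro_zone_lock_phy_page(pa, va, size);
 *   ... write through phystokv(pa) ...
 *   pmap_ro_zone_unlock_phy_page(pa, va, size);
 */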
11745 
11746 /**
11747  * Function to copy a kauth_cred from new_data to kv.
11748  * The function is defined in "kern_prot.c".
11749  *
11750  * @note Will be removed upon completion of
11751  * <rdar://problem/72635194> Compiler PAC support for memcpy.
11752  *
11753  * @param kv Address to copy new data to.
11754  * @param new_data Pointer to new data.
11755  *
11756  */
11757 
11758 extern void
11759 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11760 
11761 /**
11762  * Zalloc-specific memcpy that writes through the physical aperture
11763  * and ensures the element being modified is from a read-only zone.
11764  *
11765  * @note Designed to work only with the zone allocator's read-only submap.
11766  *
11767  * @param zid The ID of the zone to allocate from.
11768  * @param va VA of element to be modified.
11769  * @param offset Offset from element.
11770  * @param new_data Pointer to new data.
11771  * @param new_data_size	Size of modification.
11772  *
11773  */
11774 
11775 void
11776 pmap_ro_zone_memcpy(
11777 	zone_id_t           zid,
11778 	vm_offset_t         va,
11779 	vm_offset_t         offset,
11780 	const vm_offset_t   new_data,
11781 	vm_size_t           new_data_size)
11782 {
11783 #if XNU_MONITOR
11784 	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
11785 #else /* XNU_MONITOR */
11786 	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11787 #endif /* XNU_MONITOR */
11788 }
11789 
11790 MARK_AS_PMAP_TEXT void
11791 pmap_ro_zone_memcpy_internal(
11792 	zone_id_t             zid,
11793 	vm_offset_t           va,
11794 	vm_offset_t           offset,
11795 	const vm_offset_t     new_data,
11796 	vm_size_t             new_data_size)
11797 {
11798 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11799 
11800 	if (!new_data || new_data_size == 0) {
11801 		return;
11802 	}
11803 
11804 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11805 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11806 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11807 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11808 }
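/*
 * Illustrative caller-side sketch, assuming a read-only zone with ID
 * `my_ro_zid` and an element `elem` allocated from it (both names are
 * hypothetical): to update a 4-byte field at offset 8 within the element,
 * the kernel issues
 *
 *   uint32_t new_value = 42;
 *   pmap_ro_zone_memcpy(my_ro_zid, (vm_offset_t)elem, 8,
 *       (vm_offset_t)&new_value, sizeof(new_value));
 *
 * rather than writing through `elem` directly, since the element's own
 * mapping is read-only.
 */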
11809 
11810 /**
11811  * Zalloc-specific function to atomically mutate fields of an element that
11812  * belongs to a read-only zone, via the physcial aperture.
11813  * belongs to a read-only zone, via the physical aperture.
11814  * @note Designed to work only with the zone allocator's read-only submap.
11815  *
11816  * @param zid The ID of the zone the element belongs to.
11817  * @param va VA of element to be modified.
11818  * @param offset Offset in element.
11819  * @param op Atomic operation to perform.
11820  * @param value	Mutation value.
11821  *
11822  */
11823 
11824 uint64_t
11825 pmap_ro_zone_atomic_op(
11826 	zone_id_t             zid,
11827 	vm_offset_t           va,
11828 	vm_offset_t           offset,
11829 	zro_atomic_op_t       op,
11830 	uint64_t              value)
11831 {
11832 #if XNU_MONITOR
11833 	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
11834 #else /* XNU_MONITOR */
11835 	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11836 #endif /* XNU_MONITOR */
11837 }
11838 
11839 MARK_AS_PMAP_TEXT uint64_t
11840 pmap_ro_zone_atomic_op_internal(
11841 	zone_id_t             zid,
11842 	vm_offset_t           va,
11843 	vm_offset_t           offset,
11844 	zro_atomic_op_t       op,
11845 	uint64_t              value)
11846 {
11847 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11848 	vm_size_t value_size = op & 0xf;
11849 
11850 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11851 	pmap_ro_zone_lock_phy_page(pa, va, value_size);
11852 	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11853 	pmap_ro_zone_unlock_phy_page(pa, va, value_size);
11854 
11855 	return value;
11856 }
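/*
 * Note on the `op & 0xf` extraction above: as consumed here, the low nibble
 * of a zro_atomic_op_t encodes the operand width in bytes, so a 64-bit
 * operation validates, pins, and mutates exactly 8 bytes at va + offset
 * before __zalloc_ro_mut_atomic() is invoked on the physical-aperture
 * mapping.
 */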
11857 
11858 /**
11859  * bzero for allocations from read only zones, that writes through the
11860  * physical aperture.
11861  *
11862  * @note This is called by the zfree path of all allocations from read
11863  * only zones.
11864  *
11865  * @param zid The ID of the zone the allocation belongs to.
11866  * @param va VA of element to be zeroed.
11867  * @param offset Offset in the element.
11868  * @param size	Size of allocation.
11869  *
11870  */
11871 
11872 void
11873 pmap_ro_zone_bzero(
11874 	zone_id_t       zid,
11875 	vm_offset_t     va,
11876 	vm_offset_t     offset,
11877 	vm_size_t       size)
11878 {
11879 #if XNU_MONITOR
11880 	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
11881 #else /* XNU_MONITOR */
11882 	pmap_ro_zone_bzero_internal(zid, va, offset, size);
11883 #endif /* XNU_MONITOR */
11884 }
11885 
11886 MARK_AS_PMAP_TEXT void
11887 pmap_ro_zone_bzero_internal(
11888 	zone_id_t       zid,
11889 	vm_offset_t     va,
11890 	vm_offset_t     offset,
11891 	vm_size_t       size)
11892 {
11893 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11894 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11895 	pmap_ro_zone_lock_phy_page(pa, va, size);
11896 	bzero((void*)phystokv(pa), size);
11897 	pmap_ro_zone_unlock_phy_page(pa, va, size);
11898 }
11899 
11900 /**
11901  * Removes write access from the Physical Aperture.
11902  *
11903  * @note For non-PPL devices, it simply makes all virtual mappings RO.
11904  * @note Designed to work only with the zone allocator's read-only submap.
11905  *
11906  * @param va VA of the page to remove write access from.
11907  *
11908  */
11909 MARK_AS_PMAP_TEXT static void
11910 pmap_phys_write_disable(vm_address_t va)
11911 {
11912 #if XNU_MONITOR
11913 	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
11914 #else /* XNU_MONITOR */
11915 	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
11916 #endif /* XNU_MONITOR */
11917 }
11918 
11919 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
11920 
11921 MARK_AS_PMAP_TEXT mach_vm_size_t
11922 pmap_query_resident_internal(
11923 	pmap_t                  pmap,
11924 	vm_map_address_t        start,
11925 	vm_map_address_t        end,
11926 	mach_vm_size_t          *compressed_bytes_p)
11927 {
11928 	mach_vm_size_t  resident_bytes = 0;
11929 	mach_vm_size_t  compressed_bytes = 0;
11930 
11931 	pt_entry_t     *bpte, *epte;
11932 	pt_entry_t     *pte_p;
11933 	tt_entry_t     *tte_p;
11934 
11935 	if (pmap == NULL) {
11936 		return PMAP_RESIDENT_INVALID;
11937 	}
11938 
11939 	validate_pmap(pmap);
11940 
11941 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11942 
11943 	/* Ensure that this request is valid, and addresses exactly one TTE. */
11944 	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11945 	    (end % pt_attr_page_size(pt_attr)))) {
11946 		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11947 	}
11948 
11949 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11950 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11951 	}
11952 
11953 	pmap_lock(pmap, PMAP_LOCK_SHARED);
11954 	tte_p = pmap_tte(pmap, start);
11955 	if (tte_p == (tt_entry_t *) NULL) {
11956 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
11957 		return PMAP_RESIDENT_INVALID;
11958 	}
11959 	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
11960 		pte_p = (pt_entry_t *) ttetokv(*tte_p);
11961 		bpte = &pte_p[pte_index(pt_attr, start)];
11962 		epte = &pte_p[pte_index(pt_attr, end)];
11963 
11964 		for (; bpte < epte; bpte++) {
11965 			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
11966 				compressed_bytes += pt_attr_page_size(pt_attr);
11967 			} else if (pa_valid(pte_to_pa(*bpte))) {
11968 				resident_bytes += pt_attr_page_size(pt_attr);
11969 			}
11970 		}
11971 	}
11972 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
11973 
11974 	if (compressed_bytes_p) {
11975 		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11976 		*compressed_bytes_p += compressed_bytes;
11977 		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
11978 	}
11979 
11980 	return resident_bytes;
11981 }
11982 
11983 mach_vm_size_t
11984 pmap_query_resident(
11985 	pmap_t                  pmap,
11986 	vm_map_address_t        start,
11987 	vm_map_address_t        end,
11988 	mach_vm_size_t          *compressed_bytes_p)
11989 {
11990 	mach_vm_size_t          total_resident_bytes;
11991 	mach_vm_size_t          compressed_bytes;
11992 	vm_map_address_t        va;
11993 
11994 
11995 	if (pmap == PMAP_NULL) {
11996 		if (compressed_bytes_p) {
11997 			*compressed_bytes_p = 0;
11998 		}
11999 		return 0;
12000 	}
12001 
12002 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12003 
12004 	total_resident_bytes = 0;
12005 	compressed_bytes = 0;
12006 
12007 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12008 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12009 	    VM_KERNEL_ADDRHIDE(end));
12010 
12011 	va = start;
12012 	while (va < end) {
12013 		vm_map_address_t l;
12014 		mach_vm_size_t resident_bytes;
12015 
12016 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12017 
12018 		if (l > end) {
12019 			l = end;
12020 		}
12021 #if XNU_MONITOR
12022 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12023 #else
12024 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12025 #endif
12026 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12027 			break;
12028 		}
12029 
12030 		total_resident_bytes += resident_bytes;
12031 
12032 		va = l;
12033 	}
12034 
12035 	if (compressed_bytes_p) {
12036 		*compressed_bytes_p = compressed_bytes;
12037 	}
12038 
12039 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12040 	    total_resident_bytes);
12041 
12042 	return total_resident_bytes;
12043 }
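/*
 * Worked example of the loop above (illustrative, 16KB-page geometry where
 * one twig/L2 entry spans 32MB): a query over [0x10000000, 0x14000000)
 * (64MB, twig-aligned) is clipped at each 32MB boundary, so the
 * internal/PPL helper is invoked twice, once per L2 entry, and the per-call
 * resident byte counts are accumulated into total_resident_bytes.
 */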
12044 
12045 #if MACH_ASSERT
12046 static void
12047 pmap_check_ledgers(
12048 	pmap_t pmap)
12049 {
12050 	int     pid;
12051 	char    *procname;
12052 
12053 	if (pmap->pmap_pid == 0) {
12054 		/*
12055 		 * This pmap was not or is no longer fully associated
12056 		 * with a task (e.g. the old pmap after a fork()/exec() or
12057 		 * spawn()).  Its "ledger" still points at a task that is
12058 		 * now using a different (and active) address space, so
12059 		 * we can't check that all the pmap ledgers are balanced here.
12060 		 *
12061 		 * If the "pid" is set, that means that we went through
12062 		 * pmap_set_process() in task_terminate_internal(), so
12063 		 * this task's ledger should not have been re-used and
12064 		 * all the pmap ledgers should be back to 0.
12065 		 */
12066 		return;
12067 	}
12068 
12069 	pid = pmap->pmap_pid;
12070 	procname = pmap->pmap_procname;
12071 
12072 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12073 }
12074 #endif /* MACH_ASSERT */
12075 
12076 void
12077 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
12078 {
12079 }
12080 
12081 /**
12082  * The minimum shared region nesting size is used by the VM to determine when to
12083  * break up large mappings to nested regions. The smallest size that these
12084  * mappings can be broken into is determined by the page table level at which
12085  * those regions are nested and by the size of the page tables.
12086  *
12087  * For instance, if a nested region is nested at L2 for a process utilizing
12088  * 16KB page tables, then the minimum nesting size would be 32MB (the size of
12089  * an L2 block entry).
12090  *
12091  * @param pmap The target pmap to determine the block size based on whether it's
12092  *             using 16KB or 4KB page tables.
12093  */
12094 uint64_t
12095 pmap_shared_region_size_min(__unused pmap_t pmap)
12096 {
12097 #if (__ARM_VMSA__ > 7)
12098 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12099 
12100 	/**
12101 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12102 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12103 	 * point to shared L3 page tables in the shared region pmap.
12104 	 */
12105 	return pt_attr_twig_size(pt_attr);
12106 
12107 #else
12108 	return ARM_NESTING_SIZE_MIN;
12109 #endif
12110 }
12111 
12112 boolean_t
12113 pmap_enforces_execute_only(
12114 #if (__ARM_VMSA__ == 7)
12115 	__unused
12116 #endif
12117 	pmap_t pmap)
12118 {
12119 #if (__ARM_VMSA__ > 7)
12120 	return pmap != kernel_pmap;
12121 #else
12122 	return FALSE;
12123 #endif
12124 }
12125 
12126 MARK_AS_PMAP_TEXT void
12127 pmap_set_vm_map_cs_enforced_internal(
12128 	pmap_t pmap,
12129 	bool new_value)
12130 {
12131 	validate_pmap_mutable(pmap);
12132 	pmap->pmap_vm_map_cs_enforced = new_value;
12133 }
12134 
12135 void
12136 pmap_set_vm_map_cs_enforced(
12137 	pmap_t pmap,
12138 	bool new_value)
12139 {
12140 #if XNU_MONITOR
12141 	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
12142 #else
12143 	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
12144 #endif
12145 }
12146 
12147 extern int cs_process_enforcement_enable;
12148 bool
12149 pmap_get_vm_map_cs_enforced(
12150 	pmap_t pmap)
12151 {
12152 	if (cs_process_enforcement_enable) {
12153 		return true;
12154 	}
12155 	return pmap->pmap_vm_map_cs_enforced;
12156 }
12157 
12158 MARK_AS_PMAP_TEXT void
12159 pmap_set_jit_entitled_internal(
12160 	__unused pmap_t pmap)
12161 {
12162 	return;
12163 }
12164 
12165 void
12166 pmap_set_jit_entitled(
12167 	pmap_t pmap)
12168 {
12169 #if XNU_MONITOR
12170 	pmap_set_jit_entitled_ppl(pmap);
12171 #else
12172 	pmap_set_jit_entitled_internal(pmap);
12173 #endif
12174 }
12175 
12176 bool
12177 pmap_get_jit_entitled(
12178 	__unused pmap_t pmap)
12179 {
12180 	return false;
12181 }
12182 
12183 MARK_AS_PMAP_TEXT kern_return_t
12184 pmap_query_page_info_internal(
12185 	pmap_t          pmap,
12186 	vm_map_offset_t va,
12187 	int             *disp_p)
12188 {
12189 	pmap_paddr_t    pa;
12190 	int             disp;
12191 	unsigned int    pai;
12192 	pt_entry_t      *pte;
12193 	pv_entry_t      **pv_h, *pve_p;
12194 
12195 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
12196 		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12197 		*disp_p = 0;
12198 		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12199 		return KERN_INVALID_ARGUMENT;
12200 	}
12201 
12202 	disp = 0;
12203 
12204 	validate_pmap(pmap);
12205 	pmap_lock(pmap, PMAP_LOCK_SHARED);
12206 
12207 	pte = pmap_pte(pmap, va);
12208 	if (pte == PT_ENTRY_NULL) {
12209 		goto done;
12210 	}
12211 
12212 	pa = pte_to_pa(*((volatile pt_entry_t*)pte));
12213 	if (pa == 0) {
12214 		if (ARM_PTE_IS_COMPRESSED(*pte, pte)) {
12215 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
12216 			if (*pte & ARM_PTE_COMPRESSED_ALT) {
12217 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
12218 			}
12219 		}
12220 	} else {
12221 		disp |= PMAP_QUERY_PAGE_PRESENT;
12222 		pai = pa_index(pa);
12223 		if (!pa_valid(pa)) {
12224 			goto done;
12225 		}
12226 		pvh_lock(pai);
12227 		pv_h = pai_to_pvh(pai);
12228 		pve_p = PV_ENTRY_NULL;
12229 		int pve_ptep_idx = 0;
12230 		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
12231 			pve_p = pvh_pve_list(pv_h);
12232 			while (pve_p != PV_ENTRY_NULL &&
12233 			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte)) == -1) {
12234 				pve_p = pve_next(pve_p);
12235 			}
12236 		}
12237 
12238 		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
12239 			disp |= PMAP_QUERY_PAGE_ALTACCT;
12240 		} else if (ppattr_test_reusable(pai)) {
12241 			disp |= PMAP_QUERY_PAGE_REUSABLE;
12242 		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
12243 			disp |= PMAP_QUERY_PAGE_INTERNAL;
12244 		}
12245 		pvh_unlock(pai);
12246 	}
12247 
12248 done:
12249 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
12250 	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12251 	*disp_p = disp;
12252 	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
12253 	return KERN_SUCCESS;
12254 }
12255 
12256 kern_return_t
12257 pmap_query_page_info(
12258 	pmap_t          pmap,
12259 	vm_map_offset_t va,
12260 	int             *disp_p)
12261 {
12262 #if XNU_MONITOR
12263 	return pmap_query_page_info_ppl(pmap, va, disp_p);
12264 #else
12265 	return pmap_query_page_info_internal(pmap, va, disp_p);
12266 #endif
12267 }
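/*
 * Illustrative caller-side sketch: the disposition returned through disp_p
 * is a bitmask, so a typical consumer checks it along these lines (`map` is
 * a hypothetical vm_map_t, used purely for illustration):
 *
 *   int disp = 0;
 *   if (pmap_query_page_info(map->pmap, va, &disp) == KERN_SUCCESS) {
 *           if (disp & PMAP_QUERY_PAGE_PRESENT) {
 *                   ... va has a valid physical mapping ...
 *           } else if (disp & PMAP_QUERY_PAGE_COMPRESSED) {
 *                   ... the page is currently in the compressor ...
 *           }
 *   }
 */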
12268 
12269 
12270 
12271 static vm_map_size_t
12272 pmap_user_va_size(pmap_t pmap __unused)
12273 {
12274 #if (__ARM_VMSA__ == 7)
12275 	return VM_MAX_ADDRESS;
12276 #else
12277 #if __ARM_MIXED_PAGE_SIZE__
12278 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12279 	return 1ULL << (64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK));
12280 #else
12281 	return 1ULL << (64 - T0SZ_BOOT);
12282 #endif
12283 #endif /* __ARM_VMSA > 7 */
12284 }
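/*
 * Worked example of the T0SZ arithmetic above (illustrative values): a TCR
 * T0SZ of 25 yields a user VA size of 1ULL << (64 - 25) = 2^39 bytes
 * (512GB), while a T0SZ of 17 would yield 2^47 bytes (128TB). The
 * __ARM_MIXED_PAGE_SIZE__ path reads T0SZ out of the pmap's own
 * pta_tcr_value so that 4K and 16K processes can report different user VA
 * sizes.
 */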
12285 
12286 
12287 
12288 kern_return_t
12289 pmap_load_legacy_trust_cache(struct pmap_legacy_trust_cache __unused *trust_cache,
12290     const vm_size_t __unused trust_cache_len)
12291 {
12292 	// Unsupported
12293 	return KERN_NOT_SUPPORTED;
12294 }
12295 
12296 pmap_tc_ret_t
12297 pmap_load_image4_trust_cache(struct pmap_image4_trust_cache __unused *trust_cache,
12298     const vm_size_t __unused trust_cache_len,
12299     uint8_t const * __unused img4_manifest,
12300     const vm_size_t __unused img4_manifest_buffer_len,
12301     const vm_size_t __unused img4_manifest_actual_len,
12302     bool __unused dry_run)
12303 {
12304 	// Unsupported
12305 	return PMAP_TC_UNKNOWN_FORMAT;
12306 }
12307 
12308 bool
12309 pmap_in_ppl(void)
12310 {
12311 	// Unsupported
12312 	return false;
12313 }
12314 
12315 bool
12316 pmap_has_ppl(void)
12317 {
12318 	// Unsupported
12319 	return false;
12320 }
12321 
12322 void
12323 pmap_lockdown_image4_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12324 {
12325 	// Unsupported
12326 }
12327 
12328 void
12329 pmap_lockdown_image4_late_slab(__unused vm_offset_t slab, __unused vm_size_t slab_len, __unused uint64_t flags)
12330 {
12331 	// Unsupported
12332 }
12333 
12334 void *
12335 pmap_claim_reserved_ppl_page(void)
12336 {
12337 	// Unsupported
12338 	return NULL;
12339 }
12340 
12341 void
12342 pmap_free_reserved_ppl_page(void __unused *kva)
12343 {
12344 	// Unsupported
12345 }
12346 
12347 
12348 MARK_AS_PMAP_TEXT bool
12349 pmap_is_trust_cache_loaded_internal(const uuid_t uuid)
12350 {
12351 	bool found = false;
12352 
12353 	pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12354 
12355 	for (struct pmap_image4_trust_cache const *c = pmap_image4_trust_caches; c != NULL; c = c->next) {
12356 		if (bcmp(uuid, c->module->uuid, sizeof(uuid_t)) == 0) {
12357 			found = true;
12358 			goto done;
12359 		}
12360 	}
12361 
12362 #ifdef PLATFORM_BridgeOS
12363 	for (struct pmap_legacy_trust_cache const *c = pmap_legacy_trust_caches; c != NULL; c = c->next) {
12364 		if (bcmp(uuid, c->uuid, sizeof(uuid_t)) == 0) {
12365 			found = true;
12366 			goto done;
12367 		}
12368 	}
12369 #endif
12370 
12371 done:
12372 	pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12373 	return found;
12374 }
12375 
12376 bool
12377 pmap_is_trust_cache_loaded(const uuid_t uuid)
12378 {
12379 #if XNU_MONITOR
12380 	return pmap_is_trust_cache_loaded_ppl(uuid);
12381 #else
12382 	return pmap_is_trust_cache_loaded_internal(uuid);
12383 #endif
12384 }
12385 
12386 MARK_AS_PMAP_TEXT bool
12387 pmap_lookup_in_loaded_trust_caches_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12388 {
12389 	struct pmap_image4_trust_cache const *cache = NULL;
12390 #ifdef PLATFORM_BridgeOS
12391 	struct pmap_legacy_trust_cache const *legacy = NULL;
12392 #endif
12393 
12394 	pmap_simple_lock(&pmap_loaded_trust_caches_lock);
12395 
12396 	for (cache = pmap_image4_trust_caches; cache != NULL; cache = cache->next) {
12397 		uint8_t hash_type = 0, flags = 0;
12398 
12399 		if (lookup_in_trust_cache_module(cache->module, cdhash, &hash_type, &flags)) {
12400 			goto done;
12401 		}
12402 	}
12403 
12404 #ifdef PLATFORM_BridgeOS
12405 	for (legacy = pmap_legacy_trust_caches; legacy != NULL; legacy = legacy->next) {
12406 		for (uint32_t i = 0; i < legacy->num_hashes; i++) {
12407 			if (bcmp(legacy->hashes[i], cdhash, CS_CDHASH_LEN) == 0) {
12408 				goto done;
12409 			}
12410 		}
12411 	}
12412 #endif
12413 
12414 done:
12415 	pmap_simple_unlock(&pmap_loaded_trust_caches_lock);
12416 
12417 	if (cache != NULL) {
12418 		return true;
12419 #ifdef PLATFORM_BridgeOS
12420 	} else if (legacy != NULL) {
12421 		return true;
12422 #endif
12423 	}
12424 
12425 	return false;
12426 }
12427 
12428 bool
12429 pmap_lookup_in_loaded_trust_caches(const uint8_t cdhash[CS_CDHASH_LEN])
12430 {
12431 #if XNU_MONITOR
12432 	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
12433 #else
12434 	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
12435 #endif
12436 }
12437 
12438 MARK_AS_PMAP_TEXT uint32_t
12439 pmap_lookup_in_static_trust_cache_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12440 {
12441 	// Awkward indirection, because the PPL macros currently force their functions to be static.
12442 	return lookup_in_static_trust_cache(cdhash);
12443 }
12444 
12445 uint32_t
12446 pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
12447 {
12448 #if XNU_MONITOR
12449 	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
12450 #else
12451 	return pmap_lookup_in_static_trust_cache_internal(cdhash);
12452 #endif
12453 }
12454 
12455 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
12456 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
12457 
12458 MARK_AS_PMAP_TEXT void
12459 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12460 {
12461 
12462 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12463 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
12464 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12465 
12466 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X", cdhash[0], cdhash[1], cdhash[2], cdhash[3]);
12467 }
12468 
12469 MARK_AS_PMAP_TEXT bool
12470 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
12471 {
12472 	bool match = false;
12473 
12474 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
12475 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
12476 		match = true;
12477 	}
12478 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
12479 
12480 	if (match) {
12481 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
12482 	}
12483 
12484 	return match;
12485 }
12486 
12487 void
12488 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12489 {
12490 #if XNU_MONITOR
12491 	pmap_set_compilation_service_cdhash_ppl(cdhash);
12492 #else
12493 	pmap_set_compilation_service_cdhash_internal(cdhash);
12494 #endif
12495 }
12496 
12497 bool
12498 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
12499 {
12500 #if XNU_MONITOR
12501 	return pmap_match_compilation_service_cdhash_ppl(cdhash);
12502 #else
12503 	return pmap_match_compilation_service_cdhash_internal(cdhash);
12504 #endif
12505 }
12506 
12507 /*
12508  * As part of supporting local signing on the device, we need the PMAP layer
12509  * to store the local signing key so that PMAP_CS can validate with it. We
12510  * store it at the PMAP layer such that it is accessible to both AMFI and
12511  * PMAP_CS should they need it.
12512  */
12513 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
12514 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE] = { 0 };
12515 
12516 MARK_AS_PMAP_TEXT void
12517 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12518 {
12519 	bool key_set = false;
12520 
12521 	/*
12522 	 * os_atomic_cmpxchg returns true if the exchange was successful. For us,
12523 	 * a successful exchange means that the local signing public key has _not_ been
12524 	 * set. If the key has already been set, we panic, as we never expect the
12525 	 * kernel to attempt to set the key more than once.
12526 	 */
12527 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
12528 
12529 	if (key_set) {
12530 		panic("attempted to set the local signing public key multiple times");
12531 	}
12532 
12533 	memcpy(pmap_local_signing_public_key, public_key, PMAP_ECC_P384_PUBLIC_KEY_SIZE);
12534 	pmap_cs_log_info("set local signing public key");
12535 }
12536 
12537 void
12538 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_ECC_P384_PUBLIC_KEY_SIZE])
12539 {
12540 #if XNU_MONITOR
12541 	return pmap_set_local_signing_public_key_ppl(public_key);
12542 #else
12543 	return pmap_set_local_signing_public_key_internal(public_key);
12544 #endif
12545 }
12546 
12547 uint8_t*
12548 pmap_get_local_signing_public_key(void)
12549 {
12550 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
12551 
12552 	if (key_set) {
12553 		return pmap_local_signing_public_key;
12554 	}
12555 
12556 	return NULL;
12557 }
12558 
12559 /*
12560  * Locally signed applications need to be explicitly authorized by an entitled application
12561  * before we allow them to run.
12562  */
12563 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
12564 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
12565 
12566 MARK_AS_PMAP_TEXT void
12567 pmap_unrestrict_local_signing_internal(
12568 	const uint8_t cdhash[CS_CDHASH_LEN])
12569 {
12570 
12571 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12572 	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12573 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12574 
12575 	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
12576 	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
12577 }
12578 
12579 void
12580 pmap_unrestrict_local_signing(
12581 	const uint8_t cdhash[CS_CDHASH_LEN])
12582 {
12583 #if XNU_MONITOR
12584 	return pmap_unrestrict_local_signing_ppl(cdhash);
12585 #else
12586 	return pmap_unrestrict_local_signing_internal(cdhash);
12587 #endif
12588 }
12589 
12590 #if PMAP_CS
12591 MARK_AS_PMAP_TEXT static void
12592 pmap_restrict_local_signing(void)
12593 {
12594 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12595 	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
12596 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12597 }
12598 
12599 MARK_AS_PMAP_TEXT static bool
12600 pmap_local_signing_restricted(
12601 	const uint8_t cdhash[CS_CDHASH_LEN])
12602 {
12603 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
12604 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
12605 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
12606 
12607 	return ret != 0;
12608 }
12609 
12610 MARK_AS_PMAP_TEXT bool
12611 pmap_cs_query_entitlements_internal(
12612 	pmap_t pmap,
12613 	CEQuery_t query,
12614 	size_t queryLength,
12615 	CEQueryContext_t finalContext)
12616 {
12617 	struct pmap_cs_code_directory *cd_entry = NULL;
12618 	bool ret = false;
12619 
12620 	if (!pmap_cs) {
12621 		panic("PMAP_CS: cannot query for entitlements as pmap_cs is turned off");
12622 	}
12623 
12624 	/*
12625 	 * When a pmap has not been passed in, we assume the caller wants to check the
12626 	 * entitlements on the current user space process.
12627 	 */
12628 	if (pmap == NULL) {
12629 		pmap = current_pmap();
12630 	}
12631 
12632 	if (pmap == kernel_pmap) {
12633 		/*
12634 		 * Instead of panicking we will just return false.
12635 		 */
12636 		return false;
12637 	}
12638 
12639 	if (query == NULL || queryLength > 64) {
12640 		panic("PMAP_CS: bogus entitlements query");
12641 	} else {
12642 		pmap_cs_assert_addr((vm_address_t)query, sizeof(CEQueryOperation_t) * queryLength, false, true);
12643 	}
12644 
12645 	if (finalContext != NULL) {
12646 		pmap_cs_assert_addr((vm_address_t)finalContext, sizeof(*finalContext), false, false);
12647 	}
12648 
12649 	validate_pmap(pmap);
12650 	pmap_lock(pmap, PMAP_LOCK_SHARED);
12651 
12652 	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
12653 	if (cd_entry == NULL) {
12654 		pmap_cs_log_error("attempted to query entitlements from an invalid pmap or a retired code directory");
12655 		goto out;
12656 	}
12657 
12658 	if (cd_entry->ce_ctx == NULL) {
12659 		pmap_cs_log_debug("%s: code signature doesn't have any entitlements", cd_entry->identifier);
12660 		goto out;
12661 	}
12662 
12663 	der_vm_context_t executionContext = cd_entry->ce_ctx->der_context;
12664 
12665 	for (size_t op = 0; op < queryLength; op++) {
12666 		executionContext = amfi->CoreEntitlements.der_vm_execute(executionContext, query[op]);
12667 	}
12668 
12669 	if (amfi->CoreEntitlements.der_vm_context_is_valid(executionContext)) {
12670 		ret = true;
12671 		if (finalContext != NULL) {
12672 			pmap_pin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12673 			finalContext->der_context = executionContext;
12674 			pmap_unpin_kernel_pages((vm_offset_t)finalContext, sizeof(*finalContext));
12675 		}
12676 	} else {
12677 		ret = false;
12678 	}
12679 
12680 out:
12681 	if (cd_entry) {
12682 		lck_rw_unlock_shared(&cd_entry->rwlock);
12683 		cd_entry = NULL;
12684 	}
12685 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
12686 
12687 	return ret;
12688 }
12689 #endif
12690 
12691 bool
12692 pmap_query_entitlements(
12693 	__unused pmap_t pmap,
12694 	__unused CEQuery_t query,
12695 	__unused size_t queryLength,
12696 	__unused CEQueryContext_t finalContext)
12697 {
12698 #if !PMAP_SUPPORTS_ENTITLEMENT_CHECKS
12699 	panic("PMAP_CS: do not use this API without checking for \'#if PMAP_SUPPORTS_ENTITLEMENT_CHECKS\'");
12700 #else
12701 
12702 #if XNU_MONITOR
12703 	return pmap_cs_query_entitlements_ppl(pmap, query, queryLength, finalContext);
12704 #else
12705 	return pmap_cs_query_entitlements_internal(pmap, query, queryLength, finalContext);
12706 #endif
12707 
12708 #endif /* !PMAP_SUPPORTS_ENTITLEMENT_CHECKS */
12709 }
12710 
12711 MARK_AS_PMAP_TEXT void
12712 pmap_footprint_suspend_internal(
12713 	vm_map_t        map,
12714 	boolean_t       suspend)
12715 {
12716 #if DEVELOPMENT || DEBUG
12717 	if (suspend) {
12718 		current_thread()->pmap_footprint_suspended = TRUE;
12719 		map->pmap->footprint_was_suspended = TRUE;
12720 	} else {
12721 		current_thread()->pmap_footprint_suspended = FALSE;
12722 	}
12723 #else /* DEVELOPMENT || DEBUG */
12724 	(void) map;
12725 	(void) suspend;
12726 #endif /* DEVELOPMENT || DEBUG */
12727 }
12728 
12729 void
12730 pmap_footprint_suspend(
12731 	vm_map_t map,
12732 	boolean_t suspend)
12733 {
12734 #if XNU_MONITOR
12735 	pmap_footprint_suspend_ppl(map, suspend);
12736 #else
12737 	pmap_footprint_suspend_internal(map, suspend);
12738 #endif
12739 }
12740 
12741 MARK_AS_PMAP_TEXT void
12742 pmap_nop_internal(pmap_t pmap __unused)
12743 {
12744 	validate_pmap_mutable(pmap);
12745 }
12746 
12747 void
12748 pmap_nop(pmap_t pmap)
12749 {
12750 #if XNU_MONITOR
12751 	pmap_nop_ppl(pmap);
12752 #else
12753 	pmap_nop_internal(pmap);
12754 #endif
12755 }
12756 
12757 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
12758 
12759 struct page_table_dump_header {
12760 	uint64_t pa;
12761 	uint64_t num_entries;
12762 	uint64_t start_va;
12763 	uint64_t end_va;
12764 };
12765 
12766 static kern_return_t
12767 pmap_dump_page_tables_recurse(pmap_t pmap,
12768     const tt_entry_t *ttp,
12769     unsigned int cur_level,
12770     unsigned int level_mask,
12771     uint64_t start_va,
12772     void *buf_start,
12773     void *buf_end,
12774     size_t *bytes_copied)
12775 {
12776 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12777 	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
12778 
12779 	uint64_t size = pt_attr->pta_level_info[cur_level].size;
12780 	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
12781 	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
12782 	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
12783 
12784 	void *bufp = (uint8_t*)buf_start + *bytes_copied;
12785 
12786 	if (cur_level == pt_attr_root_level(pt_attr)) {
12787 		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
12788 	}
12789 
12790 	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
12791 	const tt_entry_t *tt_end = &ttp[num_entries];
12792 
12793 	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
12794 		return KERN_INSUFFICIENT_BUFFER_SIZE;
12795 	}
12796 
12797 	if (level_mask & (1U << cur_level)) {
12798 		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
12799 		header->pa = ml_static_vtop((vm_offset_t)ttp);
12800 		header->num_entries = num_entries;
12801 		header->start_va = start_va;
12802 		header->end_va = start_va + (num_entries * size);
12803 
12804 		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
12805 		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
12806 	}
12807 	uint64_t current_va = start_va;
12808 
12809 	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
12810 		tt_entry_t tte = *ttep;
12811 
12812 		if (!(tte & valid_mask)) {
12813 			continue;
12814 		}
12815 
12816 		if ((tte & type_mask) == type_block) {
12817 			continue;
12818 		} else {
12819 			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
12820 				panic("%s: corrupt entry %#llx at %p, "
12821 				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
12822 				    __FUNCTION__, tte, ttep,
12823 				    ttp, cur_level, bufp, buf_end);
12824 			}
12825 
12826 			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
12827 
12828 			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
12829 			    level_mask, current_va, buf_start, buf_end, bytes_copied);
12830 
12831 			if (recurse_result != KERN_SUCCESS) {
12832 				return recurse_result;
12833 			}
12834 		}
12835 	}
12836 
12837 	return KERN_SUCCESS;
12838 }
12839 
12840 kern_return_t
12841 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12842 {
12843 	if (not_in_kdp) {
12844 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
12845 	}
12846 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12847 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
12848 }
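/*
 * Sketch of the resulting buffer layout: for each table selected by
 * level_mask, in pre-order, the dump contains a struct page_table_dump_header
 * immediately followed by header->num_entries raw tt_entry_t values. A
 * consumer could therefore walk the dump roughly as follows (illustrative;
 * `buf` and `copied` stand for the buffer and the byte count previously
 * returned through bufp/bytes_copied):
 *
 *   size_t off = 0;
 *   while (off < copied) {
 *           struct page_table_dump_header *h = (void *)((uint8_t *)buf + off);
 *           tt_entry_t *entries = (tt_entry_t *)(h + 1);
 *           ... inspect entries[0 .. h->num_entries - 1],
 *               covering [h->start_va, h->end_va) ...
 *           off += sizeof(*h) + h->num_entries * sizeof(tt_entry_t);
 *   }
 */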
12849 
12850 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12851 
12852 kern_return_t
12853 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
12854     unsigned int level_mask __unused, size_t *bytes_copied __unused)
12855 {
12856 	return KERN_NOT_SUPPORTED;
12857 }
12858 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
12859 
12860 
12861 #ifdef CONFIG_XNUPOST
12862 #ifdef __arm64__
12863 static volatile bool pmap_test_took_fault = false;
12864 
12865 static bool
12866 pmap_test_fault_handler(arm_saved_state_t * state)
12867 {
12868 	bool retval                 = false;
12869 	uint32_t esr                = get_saved_state_esr(state);
12870 	esr_exception_class_t class = ESR_EC(esr);
12871 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
12872 
12873 	if ((class == ESR_EC_DABORT_EL1) &&
12874 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
12875 		pmap_test_took_fault = true;
12876 		/* return to the instruction immediately after the call to NX page */
12877 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12878 		retval = true;
12879 	}
12880 
12881 	return retval;
12882 }
12883 
12884 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
12885 static NOKASAN bool
12886 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
12887 {
12888 	pmap_t old_pmap = NULL;
12889 
12890 	pmap_test_took_fault = false;
12891 
12892 	/*
12893 	 * We're potentially switching pmaps without using the normal thread
12894 	 * mechanism; disable interrupts and preemption to avoid any unexpected
12895 	 * memory accesses.
12896 	 */
12897 	uint64_t old_int_state = pmap_interrupts_disable();
12898 	mp_disable_preemption();
12899 
12900 	if (pmap != NULL) {
12901 		old_pmap = current_pmap();
12902 		pmap_switch(pmap);
12903 
12904 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
12905 #if __ARM_PAN_AVAILABLE__
12906 		__builtin_arm_wsr("pan", 0);
12907 #endif /* __ARM_PAN_AVAILABLE__ */
12908 	}
12909 
12910 	ml_expect_fault_begin(pmap_test_fault_handler, va);
12911 
12912 	if (is_write) {
12913 		*((volatile uint64_t*)(va)) = 0xdec0de;
12914 	} else {
12915 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
12916 		(void)tmp;
12917 	}
12918 
12919 	/* Save the fault bool, and undo the gross stuff we did. */
12920 	bool took_fault = pmap_test_took_fault;
12921 	ml_expect_fault_end();
12922 
12923 	if (pmap != NULL) {
12924 #if __ARM_PAN_AVAILABLE__
12925 		__builtin_arm_wsr("pan", 1);
12926 #endif /* __ARM_PAN_AVAILABLE__ */
12927 
12928 		pmap_switch(old_pmap);
12929 	}
12930 
12931 	mp_enable_preemption();
12932 	pmap_interrupts_restore(old_int_state);
12933 	bool retval = (took_fault == should_fault);
12934 	return retval;
12935 }
12936 
12937 static bool
12938 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12939 {
12940 	bool retval = pmap_test_access(pmap, va, should_fault, false);
12941 
12942 	if (!retval) {
12943 		T_FAIL("%s: %s, "
12944 		    "pmap=%p, va=%p, should_fault=%u",
12945 		    __func__, should_fault ? "did not fault" : "faulted",
12946 		    pmap, (void*)va, (unsigned)should_fault);
12947 	}
12948 
12949 	return retval;
12950 }
12951 
12952 static bool
12953 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12954 {
12955 	bool retval = pmap_test_access(pmap, va, should_fault, true);
12956 
12957 	if (!retval) {
12958 		T_FAIL("%s: %s, "
12959 		    "pmap=%p, va=%p, should_fault=%u",
12960 		    __func__, should_fault ? "did not fault" : "faulted",
12961 		    pmap, (void*)va, (unsigned)should_fault);
12962 	}
12963 
12964 	return retval;
12965 }
12966 
12967 static bool
12968 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12969 {
12970 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12971 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12972 
12973 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12974 
12975 	if (!retval) {
12976 		T_FAIL("%s: bits=%u, "
12977 		    "pa=%p, should_be_set=%u",
12978 		    __func__, bits,
12979 		    (void*)pa, should_be_set);
12980 	}
12981 
12982 	return retval;
12983 }
12984 
12985 static __attribute__((noinline)) bool
12986 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12987 {
12988 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12989 	return retval;
12990 }
12991 
12992 static int
12993 pmap_test_test_config(unsigned int flags)
12994 {
12995 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
12996 	unsigned int map_count = 0;
12997 	unsigned long page_ratio = 0;
12998 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
12999 
13000 	if (!pmap) {
13001 		panic("Failed to allocate pmap");
13002 	}
13003 
13004 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
13005 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
13006 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
13007 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
13008 
13009 	if (pmap_page_size <= native_page_size) {
13010 		page_ratio = native_page_size / pmap_page_size;
13011 	} else {
13012 		/*
13013 		 * This configuration would imply a page_ratio of less than 1, which is
13014 		 * not currently supported by the pmap layer; panic.
13015 		 */
13016 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu, "
13017 		    "flags=%u",
13018 		    __func__, native_page_size, pmap_page_size,
13019 		    flags);
13020 	}
13021 
13022 	if (PAGE_RATIO > 1) {
13023 		/*
13024 		 * The kernel is deliberately pretending to have 16KB pages.
13025 		 * The pmap layer has code that supports this, so pretend the
13026 		 * page size is larger than it is.
13027 		 */
13028 		pmap_page_size = PAGE_SIZE;
13029 		native_page_size = PAGE_SIZE;
13030 	}
13031 
13032 	/*
13033 	 * Get two pages from the VM; one to be mapped wired, and one to be
13034 	 * mapped nonwired.
13035 	 */
13036 	vm_page_t unwired_vm_page = vm_page_grab();
13037 	vm_page_t wired_vm_page = vm_page_grab();
13038 
13039 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
13040 		panic("Failed to grab VM pages");
13041 	}
13042 
13043 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
13044 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
13045 
13046 	pmap_paddr_t pa = ptoa(pn);
13047 	pmap_paddr_t wired_pa = ptoa(wired_pn);
13048 
13049 	/*
13050 	 * We'll start mappings at the second twig TT.  This keeps us from only
13051 	 * using the first entry in each TT, which would trivially be address
13052 	 * 0; one of the things we will need to test is retrieving the VA for
13053 	 * a given PTE.
13054 	 */
13055 	vm_map_address_t va_base = pmap_twig_size;
13056 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
13057 
13058 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
13059 		/*
13060 		 * Not exactly a functional failure, but this test relies on
13061 		 * there being a spare PTE slot we can use to pin the TT.
13062 		 */
13063 		panic("Cannot pin translation table");
13064 	}
13065 
13066 	/*
13067 	 * Create the wired mapping; this will prevent the pmap layer from
13068 	 * reclaiming our test TTs, which would interfere with this test
13069 	 * ("interfere" -> "make it panic").
13070 	 */
13071 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
13072 
13073 #if XNU_MONITOR
13074 	/*
13075 	 * If the PPL is enabled, make sure that the kernel cannot write
13076 	 * to PPL memory.
13077 	 */
13078 	if (!pmap_ppl_disable) {
13079 		T_LOG("Validate that kernel cannot write to PPL memory.");
13080 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
13081 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
13082 	}
13083 #endif
13084 
13085 	/*
13086 	 * Create read-only mappings of the nonwired page; if the pmap does
13087 	 * not use the same page size as the kernel, create multiple mappings
13088 	 * so that the kernel page is fully mapped.
13089 	 */
13090 	for (map_count = 0; map_count < page_ratio; map_count++) {
13091 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
13092 	}
13093 
13094 	/* Validate that all the PTEs have the expected PA and VA. */
13095 	for (map_count = 0; map_count < page_ratio; map_count++) {
13096 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
13097 
13098 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
13099 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
13100 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
13101 		}
13102 
13103 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
13104 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
13105 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
13106 		}
13107 	}
13108 
13109 	T_LOG("Validate that reads to our mapping do not fault.");
13110 	pmap_test_read(pmap, va_base, false);
13111 
13112 	T_LOG("Validate that writes to our mapping fault.");
13113 	pmap_test_write(pmap, va_base, true);
13114 
13115 	T_LOG("Make the first mapping writable.");
13116 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13117 
13118 	T_LOG("Validate that writes to our mapping do not fault.");
13119 	pmap_test_write(pmap, va_base, false);
13120 
13121 
13122 	T_LOG("Make the first mapping XO.");
13123 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
13124 
13125 	T_LOG("Validate that reads to our mapping do not fault.");
13126 	pmap_test_read(pmap, va_base, false);
13127 
13128 	T_LOG("Validate that writes to our mapping fault.");
13129 	pmap_test_write(pmap, va_base, true);
13130 
13131 
13132 	/*
13133 	 * For page ratios of greater than 1: validate that writes to the other
13134 	 * mappings still fault.  Remove the mappings afterwards (we're done
13135 	 * with page ratio testing).
13136 	 */
13137 	for (map_count = 1; map_count < page_ratio; map_count++) {
13138 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
13139 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
13140 	}
13141 
13142 	T_LOG("Mark the page unreferenced and unmodified.");
13143 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13144 	pmap_test_check_refmod(pa, 0);
13145 
13146 	/*
13147 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
13148 	 * different protection/fault_type settings, and confirm that the
13149 	 * ref/mod state matches our expectations at each step.
13150 	 */
13151 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
13152 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
13153 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13154 
13155 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
13156 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13157 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13158 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13159 
13160 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
13161 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13162 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
13163 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13164 
13165 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
13166 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
13167 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13168 
13169 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
13170 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13171 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13172 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13173 
13174 	/*
13175 	 * Shared memory testing; we'll have two mappings; one read-only,
13176 	 * one read-write.
13177 	 */
13178 	vm_map_address_t rw_base = va_base;
13179 	vm_map_address_t ro_base = va_base + pmap_page_size;
13180 
13181 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13182 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
13183 
13184 	/*
13185 	 * Test that we take faults as expected for unreferenced/unmodified
13186 	 * pages.  Also test the arm_fast_fault interface, to ensure that
13187 	 * mapping permissions change as expected.
13188 	 */
13189 	T_LOG("!ref/!mod: expect no access");
13190 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13191 	pmap_test_read_write(pmap, ro_base, false, false);
13192 	pmap_test_read_write(pmap, rw_base, false, false);
13193 
13194 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
13195 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
13196 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
13197 	pmap_test_read_write(pmap, ro_base, true, false);
13198 	pmap_test_read_write(pmap, rw_base, true, false);
13199 
13200 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
13201 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13202 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13203 	pmap_test_read_write(pmap, ro_base, true, false);
13204 	pmap_test_read_write(pmap, rw_base, true, true);
13205 
13206 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
13207 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
13208 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
13209 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13210 	pmap_test_read_write(pmap, ro_base, true, false);
13211 	pmap_test_read_write(pmap, rw_base, true, true);
13212 
13213 	T_LOG("RW protect both mappings; should not change protections.");
13214 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13215 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
13216 	pmap_test_read_write(pmap, ro_base, true, false);
13217 	pmap_test_read_write(pmap, rw_base, true, true);
13218 
13219 	T_LOG("Read protect both mappings; RW mapping should become RO.");
13220 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
13221 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
13222 	pmap_test_read_write(pmap, ro_base, true, false);
13223 	pmap_test_read_write(pmap, rw_base, true, false);
13224 
13225 	T_LOG("RW protect the page; mappings should not change protections.");
13226 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
13227 	pmap_page_protect(pn, VM_PROT_ALL);
13228 	pmap_test_read_write(pmap, ro_base, true, false);
13229 	pmap_test_read_write(pmap, rw_base, true, true);
13230 
13231 	T_LOG("Read protect the page; RW mapping should become RO.");
13232 	pmap_page_protect(pn, VM_PROT_READ);
13233 	pmap_test_read_write(pmap, ro_base, true, false);
13234 	pmap_test_read_write(pmap, rw_base, true, false);
13235 
13236 	T_LOG("Validate that disconnect removes all known mappings of the page.");
13237 	pmap_disconnect(pn);
13238 	if (!pmap_verify_free(pn)) {
13239 		T_FAIL("Page still has mappings");
13240 	}
13241 
13242 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
13243 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
13244 	pmap_destroy(pmap);
13245 
13246 	T_LOG("Release the pages back to the VM.");
13247 	vm_page_lock_queues();
13248 	vm_page_free(unwired_vm_page);
13249 	vm_page_free(wired_vm_page);
13250 	vm_page_unlock_queues();
13251 
13252 	T_LOG("Testing successful!");
13253 	return 0;
13254 }
13255 #endif /* __arm64__ */
13256 
13257 kern_return_t
13258 pmap_test(void)
13259 {
13260 	T_LOG("Starting pmap_tests");
13261 #ifdef __arm64__
13262 	int flags = 0;
13263 	flags |= PMAP_CREATE_64BIT;
13264 
13265 #if __ARM_MIXED_PAGE_SIZE__
13266 	T_LOG("Testing VM_PAGE_SIZE_4KB");
13267 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
13268 	T_LOG("Testing VM_PAGE_SIZE_16KB");
13269 	pmap_test_test_config(flags);
13270 #else /* __ARM_MIXED_PAGE_SIZE__ */
13271 	pmap_test_test_config(flags);
13272 #endif /* __ARM_MIXED_PAGE_SIZE__ */
13273 
13274 #endif /* __arm64__ */
13275 	T_PASS("completed pmap_test successfully");
13276 	return KERN_SUCCESS;
13277 }
13278 #endif /* CONFIG_XNUPOST */
13279 
13280 /*
13281  * The following function should never make it to RELEASE code, since
13282  * it provides a way to get the PPL to modify text pages.
13283  */
13284 #if DEVELOPMENT || DEBUG
13285 
13286 #define ARM_UNDEFINED_INSN 0xe7f000f0       /* A32 permanently undefined (UDF) encoding */
13287 #define ARM_UNDEFINED_INSN_THUMB 0xde00     /* T16 permanently undefined (UDF #0) encoding */
13288 
13289 /**
13290  * Forcibly overwrite executable text with an illegal instruction.
13291  *
13292  * @note Only used for xnu unit testing.
13293  *
13294  * @param pa The physical address to corrupt.
13295  *
13296  * @return KERN_SUCCESS on success.
13297  */
13298 kern_return_t
13299 pmap_test_text_corruption(pmap_paddr_t pa)
13300 {
13301 #if XNU_MONITOR
13302 	return pmap_test_text_corruption_ppl(pa);
13303 #else /* XNU_MONITOR */
13304 	return pmap_test_text_corruption_internal(pa);
13305 #endif /* XNU_MONITOR */
13306 }
13307 
13308 MARK_AS_PMAP_TEXT kern_return_t
13309 pmap_test_text_corruption_internal(pmap_paddr_t pa)
13310 {
13311 	vm_offset_t va = phystokv(pa);
13312 	unsigned int pai = pa_index(pa);
13313 
13314 	assert(pa_valid(pa));
13315 
13316 	pvh_lock(pai);
13317 
13318 	pv_entry_t **pv_h = pai_to_pvh(pai);
13319 	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
13320 #if defined(PVH_FLAG_EXEC)
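	/*
	 * Executable pages may be mapped read-only in the physical aperture
	 * (PVH_FLAG_EXEC).  Temporarily switch the aperture mapping to
	 * kernel-writable so the patch below can land, then restore the
	 * read-only permission afterwards.
	 */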
13321 	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
13322 
13323 	if (need_ap_twiddle) {
13324 		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
13325 	}
13326 #endif /* defined(PVH_FLAG_EXEC) */
13327 
13328 	/*
13329 	 * The low bit in an instruction address indicates a THUMB instruction;
13329 	 * it must be cleared to obtain the actual store address.
13330 	 */
13331 	if (va & 1) {
13332 		va &= ~(vm_offset_t)1;
13333 		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
13334 	} else {
13335 		*(uint32_t *)va = ARM_UNDEFINED_INSN;
13336 	}
13337 
13338 #if defined(PVH_FLAG_EXEC)
13339 	if (need_ap_twiddle) {
13340 		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
13341 	}
13342 #endif /* defined(PVH_FLAG_EXEC) */
13343 
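	/*
	 * Invalidate the instruction cache to the point of unification so the
	 * CPU fetches the newly written undefined instruction rather than a
	 * stale cached copy.
	 */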
13344 	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
13345 
13346 	pvh_unlock(pai);
13347 
13348 	return KERN_SUCCESS;
13349 }
13350 
13351 #endif /* DEVELOPMENT || DEBUG */
13352