xref: /xnu-8796.101.5/osfmk/arm/pmap/pmap.c (revision aca3beaa3dfbd42498b42c5e5ce20a938e6554e5)
1 /*
2  * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Dispatch table of native (non-stage-2) page table operations: ASID
 * allocation/free and the ASID-tagged TLB maintenance + WIMG-to-PTE
 * conversion routines used for tables walked by this CPU's MMU.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140 
/*
 * Per-level geometry and entry-format constants for the 16KB translation
 * granule, indexed by table level 0-3.  Levels 0-2 use the table/block TTE
 * valid/type encodings; level 3 (leaf) uses the PTE encodings, with
 * ARM_TTE_TYPE_L3BLOCK marking a leaf "block" entry.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level geometry and entry-format constants for the 4KB translation
 * granule, indexed by table level 0-3.  Mirrors pmap_table_level_info_16k
 * but with 4K sizes/shifts/masks.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/*
 * Page table attributes for pmaps using the 4KB granule: level info table,
 * root/commpage/max levels, AP and XN permission bit encodings, and (on
 * mixed-page-size systems) the TCR granule-size field value.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* 4K granule resolves 9 address bits per level below the 48-bit ceiling. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_page_shift = 12,
};
247 
/*
 * Page table attributes for pmaps using the 16KB granule.  Mirrors
 * pmap_pt_attr_4k; the 16K root is always at L1 and the commpage at L2.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_page_shift = 14,
};
266 
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272 
273 
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276 
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279 
280 #endif /* MACH_ASSERT */
281 
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 	(current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288 
289 
290 SECURITY_READ_ONLY_LATE(int) srd_fused = 0;
291 
292 /*
293  * Represents a tlb range that will be flushed before exiting
294  * the ppl.
295  * Used by phys_attribute_clear_range to defer flushing pages in
296  * this range until the end of the operation.
297  */
298 typedef struct pmap_tlb_flush_range {
299 	pmap_t ptfr_pmap;
300 	vm_map_address_t ptfr_start;
301 	vm_map_address_t ptfr_end;
302 	bool ptfr_flush_needed;
303 } pmap_tlb_flush_range_t;
304 
305 #if XNU_MONITOR
306 /*
307  * PPL External References.
308  */
309 extern vm_offset_t   segPPLDATAB;
310 extern unsigned long segSizePPLDATA;
311 extern vm_offset_t   segPPLTEXTB;
312 extern unsigned long segSizePPLTEXT;
313 extern vm_offset_t   segPPLDATACONSTB;
314 extern unsigned long segSizePPLDATACONST;
315 
316 
317 /*
318  * PPL Global Variables
319  */
320 
321 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
322 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
323 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
324 #else
325 const boolean_t pmap_ppl_disable = FALSE;
326 #endif
327 
328 /*
329  * Indicates if the PPL has started applying APRR.
330  * This variable is accessed from various assembly trampolines, so be sure to change
331  * those if you change the size or layout of this variable.
332  */
333 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
334 
335 extern void *pmap_stacks_start;
336 extern void *pmap_stacks_end;
337 
338 #endif /* XNU_MONITOR */
339 
340 
341 
342 /* Virtual memory region for early allocation */
343 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
344 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
345 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
346 
347 extern uint8_t bootstrap_pagetables[];
348 
349 extern unsigned int not_in_kdp;
350 
351 extern vm_offset_t first_avail;
352 
353 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
354 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
355 extern vm_offset_t     static_memory_end;
356 
357 extern const vm_map_address_t physmap_base;
358 extern const vm_map_address_t physmap_end;
359 
360 extern int maxproc, hard_maxproc;
361 
362 /* The number of address bits one TTBR can cover. */
363 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
364 
365 /*
366  * The bounds on our TTBRs.  These are for sanity checking that
367  * an address is accessible by a TTBR before we attempt to map it.
368  */
369 
370 /* The level of the root of a page table. */
371 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
372 
373 /* The number of entries in the root TT of a page table. */
374 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
375 
376 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
377 const pmap_t    kernel_pmap = &kernel_pmap_store;
378 
379 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
380 
381 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
382 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
383 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
384 
385 typedef struct tt_free_entry {
386 	struct tt_free_entry    *next;
387 } tt_free_entry_t;
388 
389 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
390 
391 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
392 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
393 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
394 #define FREE_PAGE_SIZE_TT_MAX   4
395 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
396 unsigned int    free_two_page_size_tt_count MARK_AS_PMAP_DATA;
397 unsigned int    free_two_page_size_tt_max MARK_AS_PMAP_DATA;
398 #define FREE_TWO_PAGE_SIZE_TT_MAX       4
399 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
400 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
401 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
402 
403 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
404 
405 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
406 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
407 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
408 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
409 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
410 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
411 
412 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
413 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
414 
415 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
416 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
417 
418 /* Lock group used for all pmap object locks. */
419 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
420 
421 #if DEVELOPMENT || DEBUG
422 int nx_enabled = 1;                                     /* enable no-execute protection */
423 int allow_data_exec  = 0;                               /* No apps may execute data */
424 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
425 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
426 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
427 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
428 #else /* DEVELOPMENT || DEBUG */
429 const int nx_enabled = 1;                                       /* enable no-execute protection */
430 const int allow_data_exec  = 0;                         /* No apps may execute data */
431 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
432 #endif /* DEVELOPMENT || DEBUG */
433 
434 /**
435  * This variable is set true during hibernation entry to protect pmap data structures
436  * during image copying, and reset false on hibernation exit.
437  */
438 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
439 
#if MACH_ASSERT
/* Validate the pmap's ledger balances at destruction time (debug builds). */
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger validation is compiled out entirely on non-MACH_ASSERT builds. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
448 
449 /**
450  * This helper function ensures that potentially-long-running batched PPL operations are
451  * called in preemptible context before entering the PPL, so that the PPL call may
452  * periodically exit to allow pending urgent ASTs to be taken.
453  */
454 static inline void
pmap_verify_preemptible(void)455 pmap_verify_preemptible(void)
456 {
457 	assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
458 }
459 
460 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
461 
462 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
463 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
464 
465 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
466 
467 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
468 #if defined(__arm64__)
469 /* end of shared region + 512MB for various purposes */
470 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
471 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
472     "Minimum address space size outside allowable range");
473 
474 // Max offset is 13.375GB for devices with "large" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
476 // Max offset is 9.375GB for devices with "small" memory config
477 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
478 
479 
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
481     "Large device address space size outside allowable range");
482 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
483     "Small device address space size outside allowable range");
484 
485 #  ifdef XNU_TARGET_OS_OSX
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
487 #  else
488 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
489 #  endif
490 #endif /* __arm64__ */
491 
492 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
493 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
494 #else
495 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
496 #endif
497 
498 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
499 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
500 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
501 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
502 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
503 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
504 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
505 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
506 
507 
508 #if __ARM_MIXED_PAGE_SIZE__
509 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
510 #endif
511 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
512 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
513 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
514 
515 /* PTE Define Macros */
516 
517 #define ARM_PTE_IS_COMPRESSED(x, p) \
518 	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
519 	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */      \
520 	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
521 	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
522 	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
523 
524 #define pte_is_wired(pte)                                                               \
525 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
526 
527 #define pte_was_writeable(pte) \
528 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
529 
530 #define pte_set_was_writeable(pte, was_writeable) \
531 	do {                                         \
532 	        if ((was_writeable)) {               \
533 	                (pte) |= ARM_PTE_WRITEABLE;  \
534 	        } else {                             \
535 	                (pte) &= ~ARM_PTE_WRITEABLE; \
536 	        }                                    \
537 	} while(0)
538 
/**
 * Set or clear the ARM_PTE_WIRED software bit in a leaf PTE, keeping the
 * owning pagetable page's wired-mapping count in sync for user pmaps.
 *
 * @param pmap  The pmap owning the mapping; only used to exempt the kernel
 *              pmap from wired-count accounting.
 * @param ptep  Pointer to the PTE updated in place (no TLB maintenance here).
 * @param wired TRUE to mark the mapping wired, FALSE to unwire it.
 */
static inline void
pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
{
	if (wired) {
		*ptep |= ARM_PTE_WIRED;
	} else {
		*ptep &= ~ARM_PTE_WIRED;
	}
	/*
	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
	 * never reclaimed.
	 */
	if (pmap == kernel_pmap) {
		return;
	}
	unsigned short *ptd_wiredcnt_ptr;
	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
	if (wired) {
		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
	} else {
		/* An underflow means an unwire without a matching wire: fatal accounting bug. */
		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
		if (__improbable(prev_wired == 0)) {
			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
		}
	}
}
566 
567 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                       \
568 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only); \
569 	arm64_sync_tlb(strong);                                                                        \
570 }
571 
572 /*
573  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
574  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
575  * will observe the updated PTE.
576  */
577 #define FLUSH_PTE()                                                                     \
578 	__builtin_arm_dmb(DMB_ISH);
579 
580 /*
581  * Synchronize updates to PTEs that were previously valid and thus may be cached in
582  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
583  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
584  * program order will not issue until the DSB completes.  Prior loads may be reordered
585  * after the barrier, but their behavior should not be materially affected by the
586  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
587  * matter for loads until the access is re-driven well after the TLB update is
588  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
589  * we should be in a position to handle access faults.  For "voluntary" PTE access
590  * restriction due to unmapping or protection, the decision to restrict access should
591  * have a data dependency on prior loads in order to avoid a data race.
592  */
593 #define FLUSH_PTE_STRONG()                                                             \
594 	__builtin_arm_dsb(DSB_ISHST);
595 
596 /**
597  * Write enough page table entries to map a single VM page. On systems where the
598  * VM page size does not match the hardware page size, multiple page table
599  * entries will need to be written.
600  *
601  * @note This function does not emit a barrier to ensure these page table writes
602  *       have completed before continuing. This is commonly needed. In the case
603  *       where a DMB or DSB barrier is needed, then use the write_pte() and
604  *       write_pte_strong() functions respectively instead of this one.
605  *
606  * @param ptep Pointer to the first page table entry to update.
607  * @param pte The value to write into each page table entry. In the case that
608  *            multiple PTEs are updated to a non-empty value, then the address
609  *            in this value will automatically be incremented for each PTE
610  *            write.
611  */
612 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)613 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
614 {
615 	/**
616 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
617 	 * systems, which is why it's checked at runtime instead of compile time.
618 	 * The "unreachable" warning needs to be suppressed because it still is a
619 	 * compile time constant on some systems.
620 	 */
621 	__unreachable_ok_push
622 	if (TEST_PAGE_RATIO_4) {
623 		if (((uintptr_t)ptep) & 0x1f) {
624 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
625 			    __func__, ptep, (void*)pte);
626 		}
627 
628 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
629 			/**
630 			 * If we're writing an empty/compressed PTE value, then don't
631 			 * auto-increment the address for each PTE write.
632 			 */
633 			*ptep = pte;
634 			*(ptep + 1) = pte;
635 			*(ptep + 2) = pte;
636 			*(ptep + 3) = pte;
637 		} else {
638 			*ptep = pte;
639 			*(ptep + 1) = pte | 0x1000;
640 			*(ptep + 2) = pte | 0x2000;
641 			*(ptep + 3) = pte | 0x3000;
642 		}
643 	} else {
644 		*ptep = pte;
645 	}
646 	__unreachable_ok_pop
647 }
648 
649 /**
650  * Writes enough page table entries to map a single VM page and then ensures
651  * those writes complete by executing a Data Memory Barrier.
652  *
653  * @note The DMB issued by this function is not strong enough to protect against
654  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
655  *       instruction is going to immediately be called after this write, it's
656  *       recommended to call write_pte_strong() instead of this function.
657  *
658  * See the function header for write_pte_fast() for more details on the
659  * parameters.
660  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DMB: ensure the PTE store(s) are visible before subsequent accesses. */
	FLUSH_PTE();
}
667 
668 /**
669  * Writes enough page table entries to map a single VM page and then ensures
670  * those writes complete by executing a Data Synchronization Barrier. This
671  * barrier provides stronger guarantees than the DMB executed by write_pte().
672  *
673  * @note This function is useful if you're going to immediately flush the TLB
674  *       after making the PTE write. A DSB is required to protect against the
675  *       TLB invalidate being reordered before the PTE write.
676  *
677  * See the function header for write_pte_fast() for more details on the
678  * parameters.
679  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DSB: ensure the PTE store(s) complete before any ensuing TLB invalidate. */
	FLUSH_PTE_STRONG();
}
686 
687 /**
688  * Retrieve the pmap structure for the thread running on the current CPU.
689  */
pmap_t
current_pmap()
{
	/* The pmap backing the current thread's VM map; asserted non-NULL below. */
	const pmap_t current = vm_map_pmap(current_thread()->map);

	assert(current != NULL);

#if XNU_MONITOR
	/**
	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
	 * decided by kernel-writable memory. This function is used in various parts
	 * of the PPL, and besides validating that the pointer returned by this
	 * function is indeed a pmap structure, it's also important to ensure that
	 * it's actually the current thread's pmap. This is because different pmaps
	 * will have access to different entitlements based on the code signature of
	 * their loaded process. So if a different user pmap is set in the current
	 * thread structure (in an effort to bypass code signing restrictions), even
	 * though the structure would validate correctly as it is a real pmap
	 * structure, it should fail here.
	 *
	 * This only needs to occur for user pmaps because the kernel pmap's root
	 * page table is always the same as TTBR1 (it's set during bootstrap and not
	 * changed so it'd be redundant to check), and its code signing fields are
	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
	 * it shouldn't be possible to set those fields. Due to that, an attacker
	 * setting the current thread's pmap to the kernel pmap as a way to bypass
	 * this check won't accomplish anything as it doesn't provide any extra code
	 * signing entitlements.
	 */
	if ((current != kernel_pmap) &&
	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
	}
#endif /* XNU_MONITOR */

	return current;
}
728 
729 #if DEVELOPMENT || DEBUG
730 
731 /*
732  * Trace levels are controlled by a bitmask in which each
733  * level can be enabled/disabled by the (1<<level) position
734  * in the boot arg
735  * Level 0: PPL extension functionality
736  * Level 1: pmap lifecycle (create/destroy/switch)
737  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
738  * Level 3: internal state management (attributes/fast-fault)
739  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
740  */
741 
742 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
743 
744 #define PMAP_TRACE(level, ...) \
745 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
746 	        KDBG_RELEASE(__VA_ARGS__); \
747 	}
748 #else /* DEVELOPMENT || DEBUG */
749 
750 #define PMAP_TRACE(level, ...)
751 
752 #endif /* DEVELOPMENT || DEBUG */
753 
754 
755 /*
756  * Internal function prototypes (forward declarations).
757  */
758 
759 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
760 
761 static void pmap_set_reference(ppnum_t pn);
762 
763 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
764 
765 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
766 
767 static kern_return_t pmap_expand(
768 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
769 
770 static int pmap_remove_range(
771 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
772 
773 static tt_entry_t *pmap_tt1_allocate(
774 	pmap_t, vm_size_t, unsigned int);
775 
776 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
777 
778 static void pmap_tt1_deallocate(
779 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
780 
781 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
782 
783 static kern_return_t pmap_tt_allocate(
784 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
785 
786 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
787 
788 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
789 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
790 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
791 
792 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
793 
794 
795 static void pmap_unmap_commpage(
796 	pmap_t pmap);
797 
798 static boolean_t
799 pmap_is_64bit(pmap_t);
800 
801 
802 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
803 
804 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
805 
806 static bool pmap_update_cache_attributes_locked(
807 	ppnum_t, unsigned, bool);
808 
809 static boolean_t arm_clear_fast_fault(
810 	ppnum_t ppnum,
811 	vm_prot_t fault_type,
812 	pt_entry_t *pte_p);
813 
814 static void pmap_trim_self(pmap_t pmap);
815 static void pmap_trim_subord(pmap_t subord);
816 
817 
818 /*
819  * Temporary prototypes, while we wait for pmap_enter to move to taking an
820  * address instead of a page number.
821  */
822 static kern_return_t
823 pmap_enter_addr(
824 	pmap_t pmap,
825 	vm_map_address_t v,
826 	pmap_paddr_t pa,
827 	vm_prot_t prot,
828 	vm_prot_t fault_type,
829 	unsigned int flags,
830 	boolean_t wired);
831 
832 kern_return_t
833 pmap_enter_options_addr(
834 	pmap_t pmap,
835 	vm_map_address_t v,
836 	pmap_paddr_t pa,
837 	vm_prot_t prot,
838 	vm_prot_t fault_type,
839 	unsigned int flags,
840 	boolean_t wired,
841 	unsigned int options,
842 	__unused void   *arg);
843 
844 #ifdef CONFIG_XNUPOST
845 kern_return_t pmap_test(void);
846 #endif /* CONFIG_XNUPOST */
847 
848 PMAP_SUPPORT_PROTOTYPES(
849 	kern_return_t,
850 	arm_fast_fault, (pmap_t pmap,
851 	vm_map_address_t va,
852 	vm_prot_t fault_type,
853 	bool was_af_fault,
854 	bool from_user), ARM_FAST_FAULT_INDEX);
855 
856 PMAP_SUPPORT_PROTOTYPES(
857 	boolean_t,
858 	arm_force_fast_fault, (ppnum_t ppnum,
859 	vm_prot_t allow_mode,
860 	int options), ARM_FORCE_FAST_FAULT_INDEX);
861 
862 MARK_AS_PMAP_TEXT static boolean_t
863 arm_force_fast_fault_with_flush_range(
864 	ppnum_t ppnum,
865 	vm_prot_t allow_mode,
866 	int options,
867 	pmap_tlb_flush_range_t *flush_range);
868 
869 /**
870  * Definition of the states driving the batch cache attributes update
871  * state machine.
872  */
873 typedef struct {
874 	uint64_t page_index : 32,           /* The page index to be operated on */
875 	    state : 8,                      /* The current state of the update machine */
876 	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
877 	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
878 	:0;
879 } batch_set_cache_attr_state_t;
880 
881 /* Possible values of the "state" field. */
882 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
883 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
884 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
885 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
886 
887 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
888 
889 PMAP_SUPPORT_PROTOTYPES(
890 	batch_set_cache_attr_state_t,
891 	pmap_batch_set_cache_attributes, (
892 #if XNU_MONITOR
893 		volatile upl_page_info_t *user_page_list,
894 #else /* !XNU_MONITOR */
895 		upl_page_info_array_t user_page_list,
896 #endif /* XNU_MONITOR */
897 		batch_set_cache_attr_state_t state,
898 		unsigned int page_cnt,
899 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
900 
901 PMAP_SUPPORT_PROTOTYPES(
902 	kern_return_t,
903 	pmap_change_wiring, (pmap_t pmap,
904 	vm_map_address_t v,
905 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
906 
907 PMAP_SUPPORT_PROTOTYPES(
908 	pmap_t,
909 	pmap_create_options, (ledger_t ledger,
910 	vm_map_size_t size,
911 	unsigned int flags,
912 	kern_return_t * kr), PMAP_CREATE_INDEX);
913 
914 PMAP_SUPPORT_PROTOTYPES(
915 	void,
916 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
917 
918 PMAP_SUPPORT_PROTOTYPES(
919 	kern_return_t,
920 	pmap_enter_options, (pmap_t pmap,
921 	vm_map_address_t v,
922 	pmap_paddr_t pa,
923 	vm_prot_t prot,
924 	vm_prot_t fault_type,
925 	unsigned int flags,
926 	boolean_t wired,
927 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
928 
929 PMAP_SUPPORT_PROTOTYPES(
930 	pmap_paddr_t,
931 	pmap_find_pa, (pmap_t pmap,
932 	addr64_t va), PMAP_FIND_PA_INDEX);
933 
934 PMAP_SUPPORT_PROTOTYPES(
935 	kern_return_t,
936 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
937 
938 
939 PMAP_SUPPORT_PROTOTYPES(
940 	boolean_t,
941 	pmap_is_empty, (pmap_t pmap,
942 	vm_map_offset_t va_start,
943 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
944 
945 
946 PMAP_SUPPORT_PROTOTYPES(
947 	unsigned int,
948 	pmap_map_cpu_windows_copy, (ppnum_t pn,
949 	vm_prot_t prot,
950 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
951 
952 PMAP_SUPPORT_PROTOTYPES(
953 	void,
954 	pmap_ro_zone_memcpy, (zone_id_t zid,
955 	vm_offset_t va,
956 	vm_offset_t offset,
957 	const vm_offset_t new_data,
958 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
959 
960 PMAP_SUPPORT_PROTOTYPES(
961 	uint64_t,
962 	pmap_ro_zone_atomic_op, (zone_id_t zid,
963 	vm_offset_t va,
964 	vm_offset_t offset,
965 	zro_atomic_op_t op,
966 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
967 
968 PMAP_SUPPORT_PROTOTYPES(
969 	void,
970 	pmap_ro_zone_bzero, (zone_id_t zid,
971 	vm_offset_t va,
972 	vm_offset_t offset,
973 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
974 
975 PMAP_SUPPORT_PROTOTYPES(
976 	vm_map_offset_t,
977 	pmap_nest, (pmap_t grand,
978 	pmap_t subord,
979 	addr64_t vstart,
980 	uint64_t size,
981 	vm_map_offset_t vrestart,
982 	kern_return_t * krp), PMAP_NEST_INDEX);
983 
984 PMAP_SUPPORT_PROTOTYPES(
985 	void,
986 	pmap_page_protect_options, (ppnum_t ppnum,
987 	vm_prot_t prot,
988 	unsigned int options,
989 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
990 
991 PMAP_SUPPORT_PROTOTYPES(
992 	vm_map_address_t,
993 	pmap_protect_options, (pmap_t pmap,
994 	vm_map_address_t start,
995 	vm_map_address_t end,
996 	vm_prot_t prot,
997 	unsigned int options,
998 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
999 
1000 PMAP_SUPPORT_PROTOTYPES(
1001 	kern_return_t,
1002 	pmap_query_page_info, (pmap_t pmap,
1003 	vm_map_offset_t va,
1004 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1005 
1006 PMAP_SUPPORT_PROTOTYPES(
1007 	mach_vm_size_t,
1008 	pmap_query_resident, (pmap_t pmap,
1009 	vm_map_address_t start,
1010 	vm_map_address_t end,
1011 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1012 
1013 PMAP_SUPPORT_PROTOTYPES(
1014 	void,
1015 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1016 
1017 PMAP_SUPPORT_PROTOTYPES(
1018 	vm_map_address_t,
1019 	pmap_remove_options, (pmap_t pmap,
1020 	vm_map_address_t start,
1021 	vm_map_address_t end,
1022 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1023 
1024 
1025 PMAP_SUPPORT_PROTOTYPES(
1026 	void,
1027 	pmap_set_cache_attributes, (ppnum_t pn,
1028 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1029 
1030 PMAP_SUPPORT_PROTOTYPES(
1031 	void,
1032 	pmap_update_compressor_page, (ppnum_t pn,
1033 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1034 
1035 PMAP_SUPPORT_PROTOTYPES(
1036 	void,
1037 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1038 
1039 #if MACH_ASSERT || XNU_MONITOR
1040 PMAP_SUPPORT_PROTOTYPES(
1041 	void,
1042 	pmap_set_process, (pmap_t pmap,
1043 	int pid,
1044 	char *procname), PMAP_SET_PROCESS_INDEX);
1045 #endif
1046 
1047 PMAP_SUPPORT_PROTOTYPES(
1048 	void,
1049 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1050 
1051 PMAP_SUPPORT_PROTOTYPES(
1052 	vm_map_offset_t,
1053 	pmap_unnest_options, (pmap_t grand,
1054 	addr64_t vaddr,
1055 	uint64_t size,
1056 	vm_map_offset_t vrestart,
1057 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1058 
1059 PMAP_SUPPORT_PROTOTYPES(
1060 	void,
1061 	phys_attribute_set, (ppnum_t pn,
1062 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1063 
1064 PMAP_SUPPORT_PROTOTYPES(
1065 	void,
1066 	phys_attribute_clear, (ppnum_t pn,
1067 	unsigned int bits,
1068 	int options,
1069 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1070 
1071 #if __ARM_RANGE_TLBI__
1072 PMAP_SUPPORT_PROTOTYPES(
1073 	vm_map_address_t,
1074 	phys_attribute_clear_range, (pmap_t pmap,
1075 	vm_map_address_t start,
1076 	vm_map_address_t end,
1077 	unsigned int bits,
1078 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1079 #endif /* __ARM_RANGE_TLBI__ */
1080 
1081 
1082 PMAP_SUPPORT_PROTOTYPES(
1083 	void,
1084 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1085 
1086 PMAP_SUPPORT_PROTOTYPES(
1087 	void,
1088 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1089 
1090 PMAP_SUPPORT_PROTOTYPES(
1091 	void,
1092 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1093 
1094 PMAP_SUPPORT_PROTOTYPES(
1095 	void,
1096 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1097 
1098 PMAP_SUPPORT_PROTOTYPES(
1099 	void,
1100 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1101 
1102 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
1103 PMAP_SUPPORT_PROTOTYPES(
1104 	void,
1105 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1106 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
1107 
1108 /* Definition of the states used by pmap_trim(). */
1109 typedef enum {
1110 	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
1111 	PMAP_TRIM_STATE_START = 0,
1112 
1113 	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
1114 	PMAP_TRIM_STATE_GRAND_BEFORE,
1115 
1116 	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
1117 	PMAP_TRIM_STATE_GRAND_AFTER,
1118 
1119 	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
1120 	PMAP_TRIM_STATE_SUBORD,
1121 
1122 	/* Marks that trimming is finished. */
1123 	PMAP_TRIM_STATE_DONE,
1124 
1125 	/* Sentry enum for sanity checks. */
1126 	PMAP_TRIM_STATE_COUNT,
1127 } pmap_trim_state_t;
1128 
1129 PMAP_SUPPORT_PROTOTYPES(
1130 	pmap_trim_state_t,
1131 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1132 
1133 #if HAS_APPLE_PAC
1134 PMAP_SUPPORT_PROTOTYPES(
1135 	void *,
1136 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1137 PMAP_SUPPORT_PROTOTYPES(
1138 	void *,
1139 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1140 #endif /* HAS_APPLE_PAC */
1141 
1142 
1143 
1144 
1145 PMAP_SUPPORT_PROTOTYPES(
1146 	kern_return_t,
1147 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1148 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1149 
1150 PMAP_SUPPORT_PROTOTYPES(
1151 	kern_return_t,
1152 	pmap_load_trust_cache_with_type, (TCType_t type,
1153 	const vm_address_t pmap_img4_payload,
1154 	const vm_size_t pmap_img4_payload_len,
1155 	const vm_address_t img4_manifest,
1156 	const vm_size_t img4_manifest_len,
1157 	const vm_address_t img4_aux_manifest,
1158 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1159 
1160 PMAP_SUPPORT_PROTOTYPES(
1161 	void,
1162 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1163 
1164 PMAP_SUPPORT_PROTOTYPES(
1165 	kern_return_t,
1166 	pmap_query_trust_cache, (TCQueryType_t query_type,
1167 	const uint8_t cdhash[kTCEntryHashSize],
1168 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1169 
1170 #if PMAP_CS_INCLUDE_CODE_SIGNING
1171 
1172 PMAP_SUPPORT_PROTOTYPES(
1173 	kern_return_t,
1174 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1175 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1176 
1177 PMAP_SUPPORT_PROTOTYPES(
1178 	kern_return_t,
1179 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1180 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1181 
1182 PMAP_SUPPORT_PROTOTYPES(
1183 	kern_return_t,
1184 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1185 	pmap_cs_profile_t * profile_obj),
1186 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1187 
1188 PMAP_SUPPORT_PROTOTYPES(
1189 	kern_return_t,
1190 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1191 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1192 
1193 PMAP_SUPPORT_PROTOTYPES(
1194 	kern_return_t,
1195 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1196 	const void *kernel_entitlements),
1197 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1198 
1199 PMAP_SUPPORT_PROTOTYPES(
1200 	kern_return_t,
1201 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1202 	const void **kernel_entitlements),
1203 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1204 
1205 PMAP_SUPPORT_PROTOTYPES(
1206 	kern_return_t,
1207 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1208 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1209 
1210 PMAP_SUPPORT_PROTOTYPES(
1211 	kern_return_t,
1212 	pmap_cs_allow_invalid, (pmap_t pmap),
1213 	PMAP_CS_ALLOW_INVALID_INDEX);
1214 
1215 PMAP_SUPPORT_PROTOTYPES(
1216 	void,
1217 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1218 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1219 
1220 PMAP_SUPPORT_PROTOTYPES(
1221 	bool,
1222 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1223 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1224 
1225 PMAP_SUPPORT_PROTOTYPES(
1226 	void,
1227 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1228 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1229 
1230 PMAP_SUPPORT_PROTOTYPES(
1231 	void,
1232 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1233 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1234 
1235 #endif
1236 
1237 PMAP_SUPPORT_PROTOTYPES(
1238 	uint32_t,
1239 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1240 
1241 PMAP_SUPPORT_PROTOTYPES(
1242 	bool,
1243 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1244 
1245 PMAP_SUPPORT_PROTOTYPES(
1246 	void,
1247 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1248 
1249 void pmap_footprint_suspend(vm_map_t    map,
1250     boolean_t   suspend);
1251 PMAP_SUPPORT_PROTOTYPES(
1252 	void,
1253 	pmap_footprint_suspend, (vm_map_t map,
1254 	boolean_t suspend),
1255 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1256 
1257 
1258 
1259 
1260 #if DEVELOPMENT || DEBUG
1261 PMAP_SUPPORT_PROTOTYPES(
1262 	kern_return_t,
1263 	pmap_test_text_corruption, (pmap_paddr_t),
1264 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1265 #endif /* DEVELOPMENT || DEBUG */
1266 
1267 /*
1268  * The low global vector page is mapped at a fixed alias.
1269  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1270  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1271  * to check both addresses anyway for backward compatibility. So for now
1272  * we leave H6 and H7 where they were.
1273  */
1274 #if (ARM_PGSHIFT == 14)
1275 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1276 #else
1277 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1278 #endif
1279 
1280 
1281 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1282 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1283 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1284 
1285 #if XNU_MONITOR
1286 
1287 #if __has_feature(ptrauth_calls)
1288 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1289 #else
1290 #define __ptrauth_ppl_handler
1291 #endif
1292 
1293 /*
1294  * Table of function pointers used for PPL dispatch.
1295  */
/*
 * Indexed by the PMAP_*_INDEX dispatch selectors declared above.  Slots not
 * explicitly listed (including those compiled out by the #if blocks below)
 * are zero-initialized to NULL by C designated-initializer semantics.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	/* Code-signing / provisioning-profile handlers, present only when PMAP_CS is built in. */
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
1376 #endif
1377 
1378 #if XNU_MONITOR
1379 /**
1380  * A convenience function for setting protections on a single physical
1381  * aperture or static region mapping without invalidating the TLB.
1382  *
1383  * @note This function does not perform any TLB invalidations. That must be done
1384  *       separately to be able to safely use the updated mapping.
1385  *
1386  * @note This function understands the difference between the VM page size and
1387  *       the kernel page size and will update multiple PTEs if the sizes differ.
1388  *       In other words, enough PTEs will always get updated to change the
1389  *       permissions on a PAGE_SIZE amount of memory.
1390  *
1391  * @note The PVH lock for the physical page represented by this mapping must
1392  *       already be locked.
1393  *
1394  * @note This function assumes the caller has already verified that the PTE
1395  *       pointer does indeed point to a physical aperture or static region page
1396  *       table. Please validate your inputs before passing it along to this
1397  *       function.
1398  *
1399  * @param ptep Pointer to the physical aperture or static region page table to
1400  *             update with a new XPRR index.
1401  * @param expected_perm The XPRR index that is expected to already exist at the
1402  *                      current mapping. If the current index doesn't match this
1403  *                      then the system will panic.
1404  * @param new_perm The new XPRR index to update the mapping with.
1405  */
1406 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1407 pmap_set_pte_xprr_perm(
1408 	pt_entry_t * const ptep,
1409 	unsigned int expected_perm,
1410 	unsigned int new_perm)
1411 {
1412 	assert(ptep != NULL);
1413 
1414 	pt_entry_t spte = *ptep;
1415 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1416 
1417 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1418 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1419 		    __func__, ptep, new_perm, expected_perm);
1420 	}
1421 
1422 	/**
1423 	 * The PTE involved should be valid, should not have the hint bit set, and
1424 	 * should have the expected XPRR index.
1425 	 */
1426 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1427 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1428 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1429 		    __func__, ptep, spte, new_perm, expected_perm);
1430 	}
1431 
1432 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1433 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1434 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1435 		    __func__, ptep, spte, new_perm, expected_perm);
1436 	}
1437 
1438 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1439 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1440 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1441 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1442 	}
1443 
1444 	pt_entry_t template = spte;
1445 	template &= ~ARM_PTE_XPRR_MASK;
1446 	template |= xprr_perm_to_pte(new_perm);
1447 
1448 	write_pte_strong(ptep, template);
1449 }
1450 
1451 /**
1452  * Update the protections on a single physical aperture mapping and invalidate
1453  * the TLB so the mapping can be used.
1454  *
1455  * @note The PVH lock for the physical page must already be locked.
1456  *
1457  * @param pai The physical address index of the page whose physical aperture
1458  *            mapping will be updated with new permissions.
1459  * @param expected_perm The XPRR index that is expected to already exist at the
1460  *                      current mapping. If the current index doesn't match this
1461  *                      then the system will panic.
1462  * @param new_perm The new XPRR index to update the mapping with.
1463  */
1464 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1465 pmap_set_xprr_perm(
1466 	unsigned int pai,
1467 	unsigned int expected_perm,
1468 	unsigned int new_perm)
1469 {
1470 	pvh_assert_locked(pai);
1471 
1472 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1473 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1474 
1475 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1476 
1477 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true);
1478 	sync_tlb_flush();
1479 }
1480 
1481 /**
1482  * Update the protections on a range of physical aperture or static region
1483  * mappings and invalidate the TLB so the mappings can be used.
1484  *
1485  * @note Static region mappings can only be updated before machine_lockdown().
1486  *       Physical aperture mappings can be updated at any time.
1487  *
1488  * @param start The starting virtual address of the static region or physical
1489  *              aperture range whose permissions will be updated.
1490  * @param end The final (inclusive) virtual address of the static region or
1491  *            physical aperture range whose permissions will be updated.
1492  * @param expected_perm The XPRR index that is expected to already exist at the
1493  *                      current mappings. If the current indices don't match
1494  *                      this then the system will panic.
1495  * @param new_perm The new XPRR index to update the mappings with.
1496  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must fall inside the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (partial) twig to the requested end address. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only table-type twig entries are expected in these regions. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/*
			 * pmap_set_pte_xprr_perm() requires the PVH lock of the mapped
			 * physical page, so take it around each per-page update.
			 */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* A single batched TLB invalidation for the whole range, after all updates. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1592 
1593 #endif /* XNU_MONITOR */
1594 
1595 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1596 PMAP_ZINFO_PALLOC(
1597 	pmap_t pmap, int bytes)
1598 {
1599 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1600 }
1601 
1602 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1603 PMAP_ZINFO_PFREE(
1604 	pmap_t pmap,
1605 	int bytes)
1606 {
1607 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1608 }
1609 
1610 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1611 pmap_tt_ledger_credit(
1612 	pmap_t          pmap,
1613 	vm_size_t       size)
1614 {
1615 	if (pmap != kernel_pmap) {
1616 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1617 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1618 	}
1619 }
1620 
1621 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1622 pmap_tt_ledger_debit(
1623 	pmap_t          pmap,
1624 	vm_size_t       size)
1625 {
1626 	if (pmap != kernel_pmap) {
1627 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1628 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1629 	}
1630 }
1631 
1632 static inline void
pmap_update_plru(uint16_t asid_index)1633 pmap_update_plru(uint16_t asid_index)
1634 {
1635 	if (__probable(pmap_asid_plru)) {
1636 		unsigned plru_index = asid_index >> 6;
1637 		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1638 			asid_plru_generation[plru_index] = ++asid_plru_gencount;
1639 			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1640 		}
1641 	}
1642 }
1643 
/*
 * Allocate a virtual ASID (vasid) for the given pmap and derive from it the
 * hardware ASID (pmap->hw_asid) and software epoch (pmap->sw_asid).
 *
 * When the Pseudo-LRU scheme is enabled, allocation prefers hardware ASIDs
 * from the 64-ASID chunk that was least recently exhausted, to reduce TLB
 * pressure from ASID reuse.  Falls back to a linear bitmap scan otherwise.
 *
 * @param pmap the pmap to receive the new ASID; hw_asid and sw_asid are set.
 * @return false if the virtual ASID space is exhausted, true on success.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

	if (__probable(pmap_asid_plru)) {
		/* Find the PLRU chunk with the lowest (oldest) generation stamp. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/* Walk the free-vasid bitmap, restricted (by masking with the chosen
		 * chunk's availability bits) to vasids whose hardware ASID falls in
		 * that chunk; the stride advances one full hardware-ASID range at a
		 * time.  Set bits in asid_bitmap mean "free". */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No virtual ASIDs left at all. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Claim the vasid (clear == allocated). */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
	pmap_simple_unlock(&asid_lock);
	/* Split the vasid into a hardware ASID and a software epoch;
	 * free_asid() reverses this arithmetic. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1711 
/*
 * Release the pmap's ASID back to the allocator, reconstructing the virtual
 * ASID from the stored hardware ASID and software epoch (the inverse of the
 * arithmetic in alloc_asid()).  Safe to call on a pmap that has no ASID or
 * concurrently with itself: the atomic exchange makes a second call a no-op.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically take ownership of the hw_asid so a double-free is harmless. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;  /* Undo the EL0/EL1 ASID-pair doubling from alloc_asid(). */
#endif
	hw_asid -= 1;  /* Undo the +1 that skipped kernel ASID 0. */

	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-VASID case: invert the high-end remapping performed when
		 * alloc_asid() landed on the unsupported MAX_HW_ASIDS slot. */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again for the PLRU allocator. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	/* Return the vasid to the free pool (set == free). */
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1740 
1741 
1742 boolean_t
pmap_valid_address(pmap_paddr_t addr)1743 pmap_valid_address(
1744 	pmap_paddr_t addr)
1745 {
1746 	return pa_valid(addr);
1747 }
1748 
1749 
1750 
1751 
1752 
1753 
1754 /*
1755  *      Map memory at initialization.  The physical addresses being
1756  *      mapped are not managed and are never unmapped.
1757  *
1758  *      For now, VM is already on, we only need to map the
1759  *      specified memory.
1760  */
1761 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1762 pmap_map(
1763 	vm_map_address_t virt,
1764 	vm_offset_t start,
1765 	vm_offset_t end,
1766 	vm_prot_t prot,
1767 	unsigned int flags)
1768 {
1769 	kern_return_t   kr;
1770 	vm_size_t       ps;
1771 
1772 	ps = PAGE_SIZE;
1773 	while (start < end) {
1774 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1775 		    prot, VM_PROT_NONE, flags, FALSE);
1776 
1777 		if (kr != KERN_SUCCESS) {
1778 			panic("%s: failed pmap_enter, "
1779 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1780 			    __FUNCTION__,
1781 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1782 		}
1783 
1784 		virt += ps;
1785 		start += ps;
1786 	}
1787 	return virt;
1788 }
1789 
/*
 * Map the physically-contiguous range [start, end) at 'virt' in the kernel
 * pmap with memory attributes selected by 'options', overwriting any PTEs
 * already present.  Page tables for the VA range must already exist (panics
 * otherwise).  Mappings are kernel-only and never executable.
 *
 * @return the virtual address one page past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      tmplate;
	pt_entry_t     *ptep;
	vm_map_address_t vaddr;
	vm_offset_t     paddr;
	pt_entry_t      mem_attr;

	/* Translate the mapping option into cacheability/shareability attributes. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		/* Default: fully uncached device-style memory. */
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* Build the template PTE: kernel-only access, never executable at any EL. */
	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("%s: no PTE for vaddr=%p, "
			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
			    __FUNCTION__, (void*)vaddr,
			    (void*)virt, (void*)start, (void*)end, prot, options);
		}

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		/* Advance the template's output address to the next physical page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the remapped VA range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1854 
1855 /*
1856  *      Back-door routine for mapping kernel VM at initialization.
1857  *      Useful for mapping memory outside the range
1858  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1859  *      Otherwise like pmap_map.
1860  */
/*
 * Simplified form of pmap_map_bd_with_options(): map [start, end) at 'virt'
 * in the kernel pmap as uncached, unbuffered, kernel-only, never-executable
 * memory, overwriting any existing PTEs.  Page tables must already exist.
 *
 * @return the virtual address one page past the last page mapped.
 */
vm_map_address_t
pmap_map_bd(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot)
{
	pt_entry_t      tmplate;
	pt_entry_t              *ptep;
	vm_map_address_t vaddr;
	vm_offset_t             paddr;

	/* not cacheable and not buffered */
	tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		/* Advance the template's output address to the next physical page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the remapped VA range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1903 
1904 /*
1905  *      Back-door routine for mapping kernel VM at initialization.
1906  *      Useful for mapping memory specific physical addresses in early
1907  *      boot (i.e., before kernel_map is initialized).
1908  *
1909  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1910  */
1911 
1912 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1913 pmap_map_high_window_bd(
1914 	vm_offset_t pa_start,
1915 	vm_size_t len,
1916 	vm_prot_t prot)
1917 {
1918 	pt_entry_t              *ptep, pte;
1919 	vm_map_address_t        va_start = VREGION1_START;
1920 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1921 	vm_map_address_t        va_end;
1922 	vm_map_address_t        va;
1923 	vm_size_t               offset;
1924 
1925 	offset = pa_start & PAGE_MASK;
1926 	pa_start -= offset;
1927 	len += offset;
1928 
1929 	if (len > (va_max - va_start)) {
1930 		panic("%s: area too large, "
1931 		    "pa_start=%p, len=%p, prot=0x%x",
1932 		    __FUNCTION__,
1933 		    (void*)pa_start, (void*)len, prot);
1934 	}
1935 
1936 scan:
1937 	for (; va_start < va_max; va_start += PAGE_SIZE) {
1938 		ptep = pmap_pte(kernel_pmap, va_start);
1939 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1940 		if (*ptep == ARM_PTE_TYPE_FAULT) {
1941 			break;
1942 		}
1943 	}
1944 	if (va_start > va_max) {
1945 		panic("%s: insufficient pages, "
1946 		    "pa_start=%p, len=%p, prot=0x%x",
1947 		    __FUNCTION__,
1948 		    (void*)pa_start, (void*)len, prot);
1949 	}
1950 
1951 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1952 		ptep = pmap_pte(kernel_pmap, va_end);
1953 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1954 		if (*ptep != ARM_PTE_TYPE_FAULT) {
1955 			va_start = va_end + PAGE_SIZE;
1956 			goto scan;
1957 		}
1958 	}
1959 
1960 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1961 		ptep = pmap_pte(kernel_pmap, va);
1962 		pte = pa_to_pte(pa_start)
1963 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1964 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1965 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
1966 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
1967 #if __ARM_KERNEL_PROTECT__
1968 		pte |= ARM_PTE_NG;
1969 #endif /* __ARM_KERNEL_PROTECT__ */
1970 		write_pte_strong(ptep, pte);
1971 	}
1972 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
1973 #if KASAN
1974 	kasan_notify_address(va_start, len);
1975 #endif
1976 	return va_start;
1977 }
1978 
1979 static uint32_t
pmap_compute_max_asids(void)1980 pmap_compute_max_asids(void)
1981 {
1982 	DTEntry entry;
1983 	void const *prop = NULL;
1984 	uint32_t max_asids;
1985 	int err;
1986 	unsigned int prop_size;
1987 
1988 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
1989 	assert(err == kSuccess);
1990 
1991 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
1992 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
1993 		 * we can choose a more flexible default value here. */
1994 		return MAX_ASIDS;
1995 	}
1996 
1997 	if (prop_size != sizeof(max_asids)) {
1998 		panic("pmap-max-asids property is not a 32-bit integer");
1999 	}
2000 
2001 	max_asids = *((uint32_t const *)prop);
2002 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2003 	max_asids = (max_asids + 63) & ~63UL;
2004 
2005 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2006 		/* currently capped by size of pmap->sw_asid */
2007 		panic("pmap-max-asids too large");
2008 	}
2009 	if (max_asids == 0) {
2010 		panic("pmap-max-asids cannot be zero");
2011 	}
2012 	return max_asids;
2013 }
2014 
2015 #if __arm64__
2016 /*
2017  * pmap_get_arm64_prot
2018  *
2019  * return effective armv8 VMSA block protections including
2020  * table AP/PXN/XN overrides of a pmap entry
2021  *
2022  */
2023 
/*
 * Walk the translation tables for 'addr' in 'pmap' and return the effective
 * protection bits (AP / XN / PXN, in PTE bit positions) of the leaf block or
 * page mapping, with all intermediate table-descriptor overrides applied.
 * Returns 0 if the walk hits an invalid entry.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Descend from the root level, accumulating table-descriptor overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Unmapped at this level: no effective protection. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated hierarchical overrides from the table entries. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2072 #endif /* __arm64__ */
2073 
2074 static void
pmap_set_srd_fusing()2075 pmap_set_srd_fusing()
2076 {
2077 	DTEntry entry;
2078 	uint32_t const *prop = NULL;
2079 	int err;
2080 	unsigned int prop_size = 0;
2081 
2082 	err = SecureDTLookupEntry(NULL, "/chosen", &entry);
2083 	if (err != kSuccess) {
2084 		panic("PMAP: no chosen DT node");
2085 	}
2086 
2087 	if (kSuccess == SecureDTGetProperty(entry, "research-enabled", (const void**)&prop, &prop_size)) {
2088 		if (prop_size == sizeof(uint32_t)) {
2089 			srd_fused = *prop;
2090 		}
2091 	}
2092 
2093 #if DEVELOPMENT || DEBUG
2094 	PE_parse_boot_argn("srd_fusing", &srd_fused, sizeof(srd_fused));
2095 #endif
2096 }
2097 
2098 /*
2099  *	Bootstrap the system enough to run with virtual memory.
2100  *
2101  *	The early VM initialization code has already allocated
2102  *	the first CPU's translation table and made entries for
2103  *	all the one-to-one mappings to be found there.
2104  *
2105  *	We must set up the kernel pmap structures, the
2106  *	physical-to-virtual translation lookup tables for the
2107  *	physical memory to be managed (between avail_start and
2108  *	avail_end).
2109  *
2110  *	Map the kernel's code and data, and allocate the system page table.
2111  *	Page_size must already be set.
2112  *
2113  *	Parameters:
2114  *	first_avail	first available physical page -
2115  *			   after kernel page tables
2116  *	avail_start	PA of first managed physical page
2117  *	avail_end	PA of last managed physical page
2118  */
2119 
2120 void
pmap_bootstrap(vm_offset_t vstart)2121 pmap_bootstrap(
2122 	vm_offset_t vstart)
2123 {
2124 	vm_map_offset_t maxoffset;
2125 
2126 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2127 
2128 	pmap_set_srd_fusing();
2129 
2130 #if XNU_MONITOR
2131 
2132 #if DEVELOPMENT || DEBUG
2133 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2134 #endif
2135 
2136 #if CONFIG_CSR_FROM_DT
2137 	if (csr_unsafe_kernel_text) {
2138 		pmap_ppl_disable = true;
2139 	}
2140 #endif /* CONFIG_CSR_FROM_DT */
2141 
2142 #endif /* XNU_MONITOR */
2143 
2144 #if DEVELOPMENT || DEBUG
2145 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2146 		kprintf("Kernel traces for pmap operations enabled\n");
2147 	}
2148 #endif
2149 
2150 	/*
2151 	 *	Initialize the kernel pmap.
2152 	 */
2153 #if ARM_PARAMETERIZED_PMAP
2154 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2155 #endif /* ARM_PARAMETERIZED_PMAP */
2156 #if HAS_APPLE_PAC
2157 	kernel_pmap->disable_jop = 0;
2158 #endif /* HAS_APPLE_PAC */
2159 	kernel_pmap->tte = cpu_tte;
2160 	kernel_pmap->ttep = cpu_ttep;
2161 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2162 	kernel_pmap->max = UINTPTR_MAX;
2163 	os_atomic_init(&kernel_pmap->ref_count, 1);
2164 #if XNU_MONITOR
2165 	os_atomic_init(&kernel_pmap->nested_count, 0);
2166 #endif
2167 	kernel_pmap->nx_enabled = TRUE;
2168 #ifdef  __arm64__
2169 	kernel_pmap->is_64bit = TRUE;
2170 #else
2171 	kernel_pmap->is_64bit = FALSE;
2172 #endif
2173 #if CONFIG_ROSETTA
2174 	kernel_pmap->is_rosetta = FALSE;
2175 #endif
2176 
2177 #if ARM_PARAMETERIZED_PMAP
2178 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2179 #endif /* ARM_PARAMETERIZED_PMAP */
2180 
2181 	kernel_pmap->nested_region_addr = 0x0ULL;
2182 	kernel_pmap->nested_region_size = 0x0ULL;
2183 	kernel_pmap->nested_region_asid_bitmap = NULL;
2184 	kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2185 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2186 
2187 	kernel_pmap->hw_asid = 0;
2188 	kernel_pmap->sw_asid = 0;
2189 
2190 	pmap_lock_init(kernel_pmap);
2191 
2192 	pmap_max_asids = pmap_compute_max_asids();
2193 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2194 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2195 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2196 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2197 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2198 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2199 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2200 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2201 
2202 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2203 
2204 	/**
2205 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2206 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2207 	 * space for these data structures.
2208 	 */
2209 	pmap_data_bootstrap();
2210 
2211 	/**
2212 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2213 	 */
2214 	uat_bootstrap();
2215 
2216 
2217 	/**
2218 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2219 	 */
2220 	sart_bootstrap();
2221 
2222 	/**
2223 	 * Don't make any assumptions about the alignment of avail_start before this
2224 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2225 	 */
2226 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2227 
2228 	const pmap_paddr_t pmap_struct_start = avail_start;
2229 
2230 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2231 	avail_start = round_page(avail_start + asid_table_size);
2232 
2233 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2234 
2235 	vm_first_phys = gPhysBase;
2236 	vm_last_phys = trunc_page(avail_end);
2237 
2238 	queue_init(&map_pmap_list);
2239 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2240 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2241 	free_page_size_tt_count = 0;
2242 	free_page_size_tt_max = 0;
2243 	free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2244 	free_two_page_size_tt_count = 0;
2245 	free_two_page_size_tt_max = 0;
2246 	free_tt_list = TT_FREE_ENTRY_NULL;
2247 	free_tt_count = 0;
2248 	free_tt_max = 0;
2249 
2250 	virtual_space_start = vstart;
2251 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2252 
2253 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2254 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2255 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2256 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2257 
2258 
2259 
2260 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2261 		maxoffset = trunc_page(maxoffset);
2262 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2263 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2264 			arm_pmap_max_offset_default = maxoffset;
2265 		}
2266 	}
2267 #if defined(__arm64__)
2268 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2269 		maxoffset = trunc_page(maxoffset);
2270 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2271 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2272 			arm64_pmap_max_offset_default = maxoffset;
2273 		}
2274 	}
2275 #endif
2276 
2277 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2278 
2279 
2280 #if PMAP_CS_PPL_MONITOR
2281 	/* Initialize the PPL trust cache read-write lock */
2282 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2283 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2284 #endif
2285 
2286 #if MACH_ASSERT
2287 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2288 	    &vm_footprint_suspend_allowed,
2289 	    sizeof(vm_footprint_suspend_allowed));
2290 #endif /* MACH_ASSERT */
2291 
2292 #if KASAN
2293 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2294 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2295 #endif /* KASAN */
2296 
2297 	/**
2298 	 * Ensure that avail_start is always left on a page boundary. The calling
2299 	 * code might not perform any alignment before allocating page tables so
2300 	 * this is important.
2301 	 */
2302 	avail_start = round_page(avail_start);
2303 }
2304 
2305 #if XNU_MONITOR
2306 
2307 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2308 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2309 {
2310 	pmap_paddr_t cur_pa;
2311 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2312 		assert(pa_valid(cur_pa));
2313 		ppattr_pa_set_monitor(cur_pa);
2314 	}
2315 }
2316 
2317 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2318 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2319     pmap_paddr_t end_pa,
2320     unsigned int expected_perm,
2321     unsigned int new_perm)
2322 {
2323 	vm_offset_t start_va = phystokv(start_pa);
2324 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2325 
2326 	pa_set_range_monitor(start_pa, end_pa);
2327 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2328 }
2329 
/*
 * Set the PVH_FLAG_LOCKDOWN_KC flag on every physical page backing the
 * kernelcache, preventing those pages from being remapped later (the flag
 * is honored by the mapping paths; see the "Prevent remapping of the
 * kernelcache" call site).  Panics if any page is already locked down.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR XNUPOST tests intentionally poke these pages; exempt them
	 * from the lockdown so the tests can run. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2375 
/*
 * Called once early boot allocations are complete: transfer ownership of all
 * statically-allocated PPL-relevant memory (bootstrap page tables, PPL text,
 * data, and stacks) to the PPL by tagging the pages monitor-owned and
 * applying the appropriate xPRR permission transitions.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = BootArgs->topOfKernelData;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* RO->RO transition: this only marks the pages monitor-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2463 
/*
 * Final step of PPL bring-up: lock down the commpage mappings so the PPL
 * will enforce their immutability, then write-protect the kernel RO commpage.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* The commpage text page must remain executable from EL0. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error appears unconditional here; in the complete
	 * source it is presumably inside an #if/#else chain selecting an xPRR
	 * configuration that this extracted view has flattened — confirm against
	 * the full file before assuming this function fails to compile.
	 */
#error "XPRR configuration error"
}
2480 #endif /* XNU_MONITOR */
2481 
2482 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2483 pmap_virtual_space(
2484 	vm_offset_t *startp,
2485 	vm_offset_t *endp
2486 	)
2487 {
2488 	*startp = virtual_space_start;
2489 	*endp = virtual_space_end;
2490 }
2491 
2492 
/*
 * Enumerate the kernel virtual regions the VM layer may manage.  The caller
 * probes with increasing region_select values; each valid index fills in
 * *startp/*size and returns TRUE, and the first unsupported index returns
 * FALSE.  Which indices exist depends on KTRR/CTRR and ARM_LARGE_MEMORY
 * configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K page geometry: 32MB (2^25) L2 block granule. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K page geometry: 8MB (2^23) L2 block granule. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only report the low-globals region when region 0 doesn't already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2586 
2587 /*
2588  * Routines to track and allocate physical pages during early boot.
2589  * On most systems that memory runs from first_avail through to avail_end
2590  * with no gaps.
2591  *
2592  * If the system supports ECC and ecc_bad_pages_count > 0, we
2593  * need to skip those pages.
2594  */
2595 
2596 static unsigned int avail_page_count = 0;
2597 static bool need_ram_ranges_init = true;
2598 
2599 
2600 /**
2601  * Checks to see if a given page is in
2602  * the array of known bad pages
2603  *
2604  * @param ppn page number to check
2605  */
2606 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2607 pmap_is_bad_ram(__unused ppnum_t ppn)
2608 {
2609 	return false;
2610 }
2611 
2612 /**
2613  * Prepare bad ram pages to be skipped.
2614  */
2615 
2616 /*
2617  * Initialize the count of available pages. No lock needed here,
2618  * as this code is called while kernel boot up is single threaded.
2619  */
2620 static void
initialize_ram_ranges(void)2621 initialize_ram_ranges(void)
2622 {
2623 	pmap_paddr_t first = first_avail;
2624 	pmap_paddr_t end = avail_end;
2625 
2626 	assert(first <= end);
2627 	assert(first == (first & ~PAGE_MASK));
2628 	assert(end == (end & ~PAGE_MASK));
2629 	avail_page_count = atop(end - first);
2630 
2631 	need_ram_ranges_init = false;
2632 }
2633 
2634 unsigned int
pmap_free_pages(void)2635 pmap_free_pages(
2636 	void)
2637 {
2638 	if (need_ram_ranges_init) {
2639 		initialize_ram_ranges();
2640 	}
2641 	return avail_page_count;
2642 }
2643 
2644 unsigned int
pmap_free_pages_span(void)2645 pmap_free_pages_span(
2646 	void)
2647 {
2648 	if (need_ram_ranges_init) {
2649 		initialize_ram_ranges();
2650 	}
2651 	return (unsigned int)atop(avail_end - first_avail);
2652 }
2653 
2654 
2655 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2656 pmap_next_page_hi(
2657 	ppnum_t            * pnum,
2658 	__unused boolean_t might_free)
2659 {
2660 	return pmap_next_page(pnum);
2661 }
2662 
2663 
2664 boolean_t
pmap_next_page(ppnum_t * pnum)2665 pmap_next_page(
2666 	ppnum_t *pnum)
2667 {
2668 	if (need_ram_ranges_init) {
2669 		initialize_ram_ranges();
2670 	}
2671 
2672 
2673 	if (first_avail != avail_end) {
2674 		*pnum = (ppnum_t)atop(first_avail);
2675 		first_avail += PAGE_SIZE;
2676 		assert(avail_page_count > 0);
2677 		--avail_page_count;
2678 		return TRUE;
2679 	}
2680 	assert(avail_page_count == 0);
2681 	return FALSE;
2682 }
2683 
2684 
2685 /*
2686  *	Initialize the pmap module.
2687  *	Called by vm_init, to initialize any structures that the pmap
2688  *	system needs to map virtual memory.
2689  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2730 
2731 /**
2732  * Verify that a given physical page contains no mappings (outside of the
2733  * default physical aperture mapping).
2734  *
2735  * @param ppnum Physical page number to check there are no mappings to.
2736  *
2737  * @return True if there are no mappings, false otherwise or if the page is not
2738  *         kernel-managed.
2739  */
2740 bool
pmap_verify_free(ppnum_t ppnum)2741 pmap_verify_free(ppnum_t ppnum)
2742 {
2743 	const pmap_paddr_t pa = ptoa(ppnum);
2744 
2745 	assert(pa != vm_page_fictitious_addr);
2746 
2747 	/* Only mappings to kernel-managed physical memory are tracked. */
2748 	if (!pa_valid(pa)) {
2749 		return false;
2750 	}
2751 
2752 	const unsigned int pai = pa_index(pa);
2753 	pv_entry_t **pvh = pai_to_pvh(pai);
2754 
2755 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2756 }
2757 
2758 #if MACH_ASSERT
2759 /**
2760  * Verify that a given physical page contains no mappings (outside of the
2761  * default physical aperture mapping) and if it does, then panic.
2762  *
2763  * @note It's recommended to use pmap_verify_free() directly when operating in
2764  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2765  *       normally being called from outside of the PPL, and the pv_head_table
2766  *       can't be modified outside of the PPL).
2767  *
2768  * @param ppnum Physical page number to check there are no mappings to.
2769  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists: identify it so the panic string is actionable. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2842 #endif
2843 
2844 
2845 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2846 pmap_root_alloc_size(pmap_t pmap)
2847 {
2848 #pragma unused(pmap)
2849 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2850 	unsigned int root_level = pt_attr_root_level(pt_attr);
2851 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2852 }
2853 
2854 
2855 /*
2856  *	Create and return a physical map.
2857  *
2858  *	If the size specified for the map
2859  *	is zero, the map is an actual physical
2860  *	map, and may be referenced by the
2861  *	hardware.
2862  *
2863  *	If the size specified is non-zero,
2864  *	the map will be used in software only, and
2865  *	is bounded by that size.
2866  */
/**
 * Internal (PPL-resident when XNU_MONITOR) implementation of pmap creation.
 *
 * @param ledger task ledger to attach to the new pmap; the caller holds the
 *               reference (see pmap_create_options()).
 * @param size   must be 0; a non-zero size is only meaningful for stage-2
 *               pmaps, which this configuration rejects.
 * @param flags  PMAP_CREATE_* options; unknown flag bits are rejected.
 * @param kr     out-parameter receiving the failure reason on PMAP_NULL return.
 *
 * @return the new pmap, or PMAP_NULL on failure (with *kr set).
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds yet; true start/end cover the whole range. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Claim an ASID (or equivalent hardware context ID) for this pmap. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_asid_bitmap = NULL;
	p->nested_region_asid_bitmap_size = 0x0UL;

	p->nested_has_no_bounds_ref = false;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition (goto-cleanup pattern). */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at caller (kernel) memory; pin it while the PPL writes it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3057 
3058 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3059 pmap_create_options(
3060 	ledger_t ledger,
3061 	vm_map_size_t size,
3062 	unsigned int flags)
3063 {
3064 	pmap_t pmap;
3065 	kern_return_t kr = KERN_SUCCESS;
3066 
3067 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3068 
3069 	ledger_reference(ledger);
3070 
3071 #if XNU_MONITOR
3072 	for (;;) {
3073 		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3074 		if (kr != KERN_RESOURCE_SHORTAGE) {
3075 			break;
3076 		}
3077 		assert(pmap == PMAP_NULL);
3078 		pmap_alloc_page_for_ppl(0);
3079 		kr = KERN_SUCCESS;
3080 	}
3081 #else
3082 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3083 #endif
3084 
3085 	if (pmap == PMAP_NULL) {
3086 		ledger_dereference(ledger);
3087 	}
3088 
3089 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3090 
3091 	return pmap;
3092 }
3093 
3094 #if XNU_MONITOR
3095 /*
3096  * This symbol remains in place when the PPL is enabled so that the dispatch
3097  * table does not change from development to release configurations.
3098  */
3099 #endif
3100 #if MACH_ASSERT || XNU_MONITOR
/**
 * Record the owning process's pid and name on a pmap for debugging
 * (MACH_ASSERT builds only; a no-op otherwise).
 *
 * @param pmap     pmap to tag; NULL is tolerated and ignored.
 * @param pid      process id to record.
 * @param procname process name to copy into the pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	/* NOTE(review): a pmap_pid of -1 appears to mean "do not re-tag" — confirm. */
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3118 #endif /* MACH_ASSERT || XNU_MONITOR */
3119 
3120 #if MACH_ASSERT
/**
 * Kernel-facing wrapper for tagging a pmap with its owning process's pid and
 * name; dispatches into the PPL when XNU_MONITOR is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3133 #endif /* MACH_ASSERT */
3134 
3135 /*
3136  * pmap_deallocate_all_leaf_tts:
3137  *
3138  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3139  * removing and deallocating all TTEs.
3140  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Leaf tables are freed via their parent TTE; never recurse that deep. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last TTE in a table at this level (~0 maps to max VA). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in a user pmap being torn down. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() appears in this function;
		 * presumably pmap_tte_deallocate() drops the lock — confirm.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3179 
3180 /*
3181  * We maintain stats and ledgers so that a task's physical footprint is:
3182  * phys_footprint = ((internal - alternate_accounting)
3183  *                   + (internal_compressed - alternate_accounting_compressed)
3184  *                   + iokit_mapped
3185  *                   + purgeable_nonvolatile
3186  *                   + purgeable_nonvolatile_compressed
3187  *                   + page_table)
3188  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3189  */
3190 
3191 /*
3192  *	Retire the given physical map from service.
3193  *	Should only be called if the map contains
3194  *	no valid mappings.
3195  */
/**
 * Drop a reference on a pmap and, when the last reference goes away, retire
 * it: unmap the commpage, remove it from the global list, tear down its
 * translation tables, flush the TLB, release its ASID, and free the struct.
 *
 * @param pmap pmap to release; PMAP_NULL is tolerated and ignored.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Only the thread that drops the count to zero performs the teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unpublish the pmap so no new users can find it. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps share an ASID with their nesters; only flush the region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_asid_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_asid_bitmap,
		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3308 
/**
 * Kernel-facing wrapper for retiring a pmap.  Captures the ledger pointer
 * before the pmap may be freed, then drops the ledger reference taken at
 * creation time.
 *
 * @param pmap pmap to destroy.  NOTE(review): unlike pmap_destroy_internal(),
 *             this path dereferences pmap for tracing/ledger before any NULL
 *             check — callers apparently never pass PMAP_NULL; confirm.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Read the ledger now; the pmap struct may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3329 
3330 
3331 /*
3332  *	Add a reference to the specified pmap.
3333  */
3334 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3335 pmap_reference_internal(
3336 	pmap_t pmap)
3337 {
3338 	if (pmap != PMAP_NULL) {
3339 		validate_pmap_mutable(pmap);
3340 		os_atomic_inc(&pmap->ref_count, relaxed);
3341 	}
3342 }
3343 
/**
 * Kernel-facing wrapper for taking a reference on a pmap; dispatches into
 * the PPL when XNU_MONITOR is enabled.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3354 
/**
 * Allocate a root (TT1) translation table of the given size.
 *
 * Tries the per-size free lists under tt1_lock first; otherwise allocates
 * fresh page(s).  For sub-page root tables, the unused remainder of the page
 * is carved into additional entries and donated to the sub-page free list.
 *
 * @param pmap   pmap the table is being allocated for (ledger accounting).
 * @param size   requested table size; sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than wait for pages.
 *
 * @return kernel virtual address of the table, or NULL on
 *         KERN_RESOURCE_SHORTAGE.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Fast path: satisfy the request from the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate zeroed page(s) for the table. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Sub-page table: thread the rest of the page into size-sized
		 * chunks and splice them onto the sub-page free list.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3431 
/**
 * Return a root (TT1) translation table to the appropriate per-size free
 * list.  Unless PMAP_TT_DEALLOCATE_NOBLOCK is passed, also trims the page
 * and two-page free lists back down to their high-water marks, returning
 * surplus pages to the system (which may block).
 *
 * @param pmap   pmap the table belonged to (ledger accounting).
 * @param tt     table to free.
 * @param size   table size; sub-page sizes other than PMAP_ROOT_ALLOC_SIZE
 *               are rounded up to PAGE_SIZE, matching pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trimming pass.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim surplus single pages.  The lock is dropped around the
	 * (potentially blocking) page free and reacquired each iteration.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise for surplus two-page allocations. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3512 
/**
 * Allocate a translation table (page table) for the given pmap.
 *
 * @param pmap The pmap that will own the new page table.
 * @param ttp Output parameter; set to the KVA of the allocated table on
 *            success (always set to NULL on entry).
 * @param level Level the table will be used at; only consulted to decide which
 *              statistics counters to bump (twig/TTE vs leaf/PTE pages).
 * @param options NOWAIT flags to fail with KERN_RESOURCE_SHORTAGE rather than
 *                block waiting for memory.
 *
 * @return KERN_SUCCESS, or KERN_RESOURCE_SHORTAGE for a failed NOWAIT request.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/*
	 * Fast path: when the pmap's page size is smaller than the kernel page
	 * size, previously-carved sub-page tables are kept on a per-pmap free
	 * list; pop one under the pmap lock if available.
	 */
	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		/*
		 * NOTE(review): the allocation keys off PMAP_TT_ALLOCATE_NOWAIT
		 * while the early return below keys off PMAP_OPTIONS_NOWAIT —
		 * confirm callers always set the two flags together.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* A page table page always needs a descriptor (PTD) to track it. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Undo the page allocation so NOWAIT failure leaks nothing. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Accounting: twig-and-above pages count as TTE pages, leaves as PTE pages. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Point the physical page's PV head at the new descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		/*
		 * If the kernel page is larger than the pmap's page-table size,
		 * carve the remainder into sub-page tables and stash them on the
		 * pmap's free list for future pmap_tt_allocate() calls.
		 */
		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3595 
3596 
/**
 * Return a translation table to the pmap's free list, freeing the backing
 * physical page once every sub-page table carved from it is free.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being deallocated; its refcount must already be
 *            zero (or the sentinel PT_DESC_REFCOUNT for non-leaf tables).
 * @param level Level of the table being deallocated (used for stats only).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized tables that fit in one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables hold the sentinel refcount; normalize it to zero. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum refcounts across all sub-page tables sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No sub-page table on this page is in use.  Count how many of the
		 * page's other sub-page tables are already sitting on the pmap's
		 * free list (this one counts as the first).
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			/* Does this free entry live in the same physical page as ttp? */
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/*
			 * Every sub-page table of this page is free: unlink them all
			 * from the free list so the whole page can be released below.
			 */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry belongs to the doomed page: splice it out. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Other sub-page tables remain free-listed; just add this one. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Page still has live sub-page tables; push this one onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the descriptor, clear the PV head, and free the page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3685 
3686 /**
3687  * Safely clear out a translation table entry.
3688  *
3689  * @note If the TTE to clear out points to a leaf table, then that leaf table
3690  *       must have a refcnt of zero before the TTE can be removed.
3691  * @note This function expects to be called with pmap locked exclusive, and will
3692  *       return with pmap unlocked.
3693  *
3694  * @param pmap The pmap containing the page table whose TTE is being removed.
3695  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3696  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3697  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3698  * @param ttep Pointer to the TTE that should be cleared out.
3699  * @param level The level of the page table that contains the TTE to be removed.
3700  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the entry and force the write out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU that has not yet been seen outside pmap_disconnect(). */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/*
				 * Load-exclusive arms the monitor so WFE wakes when the remote
				 * CPU updates its inflight_disconnect flag; avoids spinning hot.
				 */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;
		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}

			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				comp++;
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3860 
3861 /**
3862  * Given a pointer to an entry within a `level` page table, delete the
3863  * page table at `level` + 1 that is represented by that entry. For instance,
3864  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3865  * contains the PA of the L3 table, and `level` would be "2".
3866  *
3867  * @note If the table getting deallocated is a leaf table, then that leaf table
3868  *       must have a refcnt of zero before getting deallocated. All other levels
3869  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3870  * @note This function expects to be called with pmap locked exclusive and will
3871  *       return with pmap unlocked.
3872  *
3873  * @param pmap The pmap that owns the page table to be deallocated.
3874  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3875  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3876  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3877  * @param ttep Pointer to the `level` TTE to remove.
3878  * @param level The level of the table that contains an entry pointing to the
3879  *              table to be removed. The deallocated page table will be a
3880  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3881  *              deleted).
3882  */
3883 void
3884 pmap_tte_deallocate(
3885 	pmap_t pmap,
3886 	vm_offset_t va_start,
3887 	vm_offset_t va_end,
3888 	bool need_strong_sync,
3889 	tt_entry_t *ttep,
3890 	unsigned int level)
3891 {
3892 	tt_entry_t tte;
3893 
3894 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3895 
3896 	tte = *ttep;
3897 
3898 	if (tte_get_ptd(tte)->pmap != pmap) {
3899 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3900 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3901 	}
3902 
3903 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3904 	    __func__, ttep, (unsigned long long)tte);
3905 
3906 	/* pmap_tte_remove() will drop the pmap lock */
3907 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3908 
3909 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3910 }
3911 
3912 /*
3913  *	Remove a range of hardware page-table entries.
3914  *	The entries given are the first (inclusive)
3915  *	and last (exclusive) entries for the VM pages.
3916  *	The virtual address is the va for the first pte.
3917  *
3918  *	The pmap must be locked.
3919  *	If the pmap is not the kernel pmap, the range must lie
3920  *	entirely within one pte-page.  This is NOT checked.
3921  *	Assumes that the pte-page exists.
3922  *
3923  *	Returns the number of PTE changed
3924  */
3925 MARK_AS_PMAP_TEXT static int
3926 pmap_remove_range(
3927 	pmap_t pmap,
3928 	vm_map_address_t va,
3929 	pt_entry_t *bpte,
3930 	pt_entry_t *epte)
3931 {
3932 	bool need_strong_sync = false;
3933 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
3934 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
3935 	if (num_changed > 0) {
3936 		PMAP_UPDATE_TLBS(pmap, va,
3937 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
3938 	}
3939 	return num_changed;
3940 }
3941 
3942 
3943 #ifdef PVH_FLAG_EXEC
3944 
3945 /*
3946  *	Update the access protection bits of the physical aperture mapping for a page.
3947  *	This is useful, for example, in guranteeing that a verified executable page
3948  *	has no writable mappings anywhere in the system, including the physical
3949  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
3950  *	synchronization overhead in cases where the call to this function is
3951  *	guaranteed to be followed by other TLB operations.
3952  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	/* Caller must already hold the PV head lock for this physical page. */
	pvh_assert_locked(pai);
	/* Locate the kernel's physical-aperture PTE for this page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the AP bits already match the requested permission. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* A contiguous-hint mapping here would imply shared AP bits across the
	 * hint region, which this per-page update cannot honor. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	/* Publish the new PTE, then invalidate; sync now unless caller defers. */
	write_pte_strong(pte_p, tmplate);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
3977 #endif /* defined(PVH_FLAG_EXEC) */
3978 
3979 
3980 
/**
 * Core PTE-removal loop over [bpte, epte) within a single page-table page.
 *
 * @param pmap The pmap whose mappings are being removed.
 * @param va VA mapped by *bpte; advanced per PTE for accounting/validation.
 * @param bpte First PTE (inclusive) to remove.
 * @param epte Last PTE (exclusive); must not cross a page-table page boundary.
 * @param eva If non-NULL, enables preemption checks; on early bailout, set to
 *            the VA at which processing stopped.
 * @param need_strong_sync Output: whether the caller's TLB flush must use a
 *                         strong DSB (set by the removal path, unused here).
 * @param options PMAP_OPTIONS_REMOVE to also reclaim compressed markers.
 *
 * @return Number of PTEs actually changed (the caller flushes the TLB).
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The refcnt adjustment below assumes all PTEs share one descriptor. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		/* npages++ yields 0 on the first pass, guaranteeing at least one
		 * PTE is processed before any preemption bailout. */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/* Resolve the PTE to a managed physical page (if any), taking the
		 * PVH lock and re-validating the PTE under it. */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE under the PVH lock; retry if it changed pages. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* Each removed user mapping drops one pagetable reference. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for per-task ledger accounting. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta once for the whole range. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4217 
4218 
4219 /*
4220  *	Remove the given range of addresses
4221  *	from the specified map.
4222  *
4223  *	It is assumed that the start and end are properly
4224  *	rounded to the hardware page size.
4225  */
4226 void
4227 pmap_remove(
4228 	pmap_t pmap,
4229 	vm_map_address_t start,
4230 	vm_map_address_t end)
4231 {
4232 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4233 }
4234 
/**
 * Remove mappings in [start, end) from the given pmap; the range is expected
 * to fall within a single twig-level (one leaf page table) region.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start Start VA of the range (page-aligned).
 * @param end Non-inclusive end VA of the range.
 * @param options Removal options passed through to
 *                pmap_remove_range_options().
 *
 * @return The VA where removal actually stopped; may be earlier than `end`
 *         if the removal loop bailed out for pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry means nothing is mapped in this region. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty, free it along with its twig TTE. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* Flush TLB entries for whatever portion of the range was processed. */
	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4301 
4302 void
4303 pmap_remove_options(
4304 	pmap_t pmap,
4305 	vm_map_address_t start,
4306 	vm_map_address_t end,
4307 	int options)
4308 {
4309 	vm_map_address_t va;
4310 
4311 	if (pmap == PMAP_NULL) {
4312 		return;
4313 	}
4314 
4315 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4316 
4317 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4318 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4319 	    VM_KERNEL_ADDRHIDE(end));
4320 
4321 #if MACH_ASSERT
4322 	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4323 		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4324 		    pmap, (uint64_t)start, (uint64_t)end);
4325 	}
4326 	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4327 		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4328 		    pmap, (uint64_t)start, (uint64_t)end);
4329 	}
4330 #endif
4331 
4332 	/*
4333 	 * We allow single-page requests to execute non-preemptibly,
4334 	 * as it doesn't make sense to sample AST_URGENT for a single-page
4335 	 * operation, and there are a couple of special use cases that
4336 	 * require a non-preemptible single-page operation.
4337 	 */
4338 	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4339 		pmap_verify_preemptible();
4340 	}
4341 
4342 	/*
4343 	 *      Invalidate the translation buffer first
4344 	 */
4345 	va = start;
4346 	while (va < end) {
4347 		vm_map_address_t l;
4348 
4349 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4350 		if (l > end) {
4351 			l = end;
4352 		}
4353 
4354 #if XNU_MONITOR
4355 		va = pmap_remove_options_ppl(pmap, va, l, options);
4356 
4357 		pmap_ledger_check_balance(pmap);
4358 #else
4359 		va = pmap_remove_options_internal(pmap, va, l, options);
4360 #endif
4361 	}
4362 
4363 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4364 }
4365 
4366 
4367 /*
4368  *	Remove phys addr if mapped in specified map
4369  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentionally a no-op. Implement to support working set code. */
}
4377 
4378 /*
4379  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4380  * switch a thread onto a new vm_map.
4381  */
4382 void
4383 pmap_switch_user(thread_t thread, vm_map_t new_map)
4384 {
4385 	pmap_t new_pmap = new_map->pmap;
4386 
4387 
4388 	thread->map = new_map;
4389 	pmap_set_pmap(new_pmap, thread);
4390 
4391 }
4392 
/**
 * Activate the given pmap on the current CPU on behalf of a thread.
 *
 * @param pmap The pmap to switch to.
 * @param thread The thread being activated; only consulted when
 *               __ARM_USER_PROTECT__ is enabled, to cache the thread's
 *               user TTB and ASID in its machine state.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Cache TTB/ASID on the thread — presumably consumed on kernel exit
	 * by the user-protect path; confirm against the trap return code. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4407 
4408 static void
4409 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4410 {
4411 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4412 }
4413 
4414 static inline bool
4415 pmap_user_ttb_is_clear(void)
4416 {
4417 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4418 }
4419 
/*
 * Non-PPL body of pmap_switch(): make 'pmap' the active user address space
 * on the current CPU, performing whatever ASID, shared-region, and commpage
 * TLB maintenance is needed to do so without creating stale or conflicting
 * translations.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;

	/* Only the kernel pmap is permitted to have ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/*
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
			break_before_make = true;
		}
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* Changing the translation granule (TCR) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/* Park TTBR0 on the invalid table while flushes are pending (break-before-make). */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if (npages_prev <= ARM64_TLB_RANGE_PAGES) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/* Region too large for a range TLBI; a full local flush subsumes the ASID flush. */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif
	}
	/* Wait for all async invalidations issued above before installing the new TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4546 
/*
 * Activate the given pmap's address space on the current CPU,
 * dispatching to the PPL when one is present.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4559 
/*
 * Lower the permission for all mappings of the given physical page.
 * Convenience wrapper around pmap_page_protect_options() with no options
 * and no flush argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4567 
4568 /*
4569  *	Routine:	pmap_page_protect_options
4570  *
4571  *	Function:
4572  *		Lower the permission for all mappings to a given
4573  *		page.
4574  */
/*
 * Lower the permission of (or remove entirely) every CPU mapping of a
 * physical page, walking the page's PV list in two passes:
 *
 *   Pass 1: rewrite each CPU PTE (downgrade or fault it) and update the
 *           pmap ledgers/per-page attributes accordingly.
 *   Pass 2: issue the required TLB invalidations and, on removal, unlink
 *           the CPU mappings from the PV list (IOMMU mappings are never
 *           removed here and are preserved in a rebuilt list).
 *
 * @param ppnum        page number of the physical page to modify
 * @param prot         new protection; VM_PROT_ALL is a no-op, read or
 *                     read/execute downgrades, anything else removes
 * @param options      PMAP_OPTIONS_* flags (NOFLUSH, COMPRESSOR, ...)
 * @param flush_range  if non-NULL, TLB invalidation for mappings of
 *                     flush_range->ptfr_pmap inside [ptfr_start, ptfr_end)
 *                     is deferred to the caller (ptfr_flush_needed is set)
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	pv_entry_t     *pveh_p;        /* head of the CPU PVE sublist to free on removal */
	pv_entry_t     *pvet_p;        /* tail of the CPU PVE sublist to free on removal */
	pt_entry_t     *pte_p, *orig_pte_p;
	pv_entry_t     *new_pve_p;     /* rebuilt list of preserved (IOMMU) PVEs */
	pt_entry_t     *new_pte_p;     /* single preserved IOMMU PTE, if the list collapses to one */
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	unsigned int    pvh_cnt = 0;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; they are never modified in pass 1. */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		/* Sanity-check the PV entry: it must point at a PTE that maps this page. */
		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			/* Downgrade to read-only: kernel mappings use RONA, user mappings the per-config RO encoding. */
			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	/* Make the pass-1 PTE writes visible before any TLB invalidation. */
	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			/* Only mappings flagged in pass 1 as permission-changed need a TLB flush. */
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	/* Restore kernel write access to the physical aperture mapping of a former exec page. */
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/* Non-remove flushes inside the caller's range are deferred to the caller. */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	pvh_unlock(pai);

	if (remove) {
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* For downgrades, the flush can safely complete after the PVH lock is dropped. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5075 
5076 MARK_AS_PMAP_TEXT void
5077 pmap_page_protect_options_internal(
5078 	ppnum_t ppnum,
5079 	vm_prot_t prot,
5080 	unsigned int options,
5081 	void *arg)
5082 {
5083 	if (arg != NULL) {
5084 		/*
5085 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5086 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5087 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5088 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5089 		 * In that case, force the flush to take place.
5090 		 */
5091 		options &= ~PMAP_OPTIONS_NOFLUSH;
5092 	}
5093 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5094 }
5095 
/*
 * Lower (never raise) the protection on every mapping of physical page
 * 'ppnum' to at most 'prot'.  Unmanaged pages and VM_PROT_ALL requests are
 * no-ops; otherwise the work is dispatched to the PPL or the in-kernel
 * implementation.
 *
 * @param ppnum    physical page number
 * @param prot     new maximum protection for all mappings of the page
 * @param options  PMAP_OPTIONS_* flags
 * @param arg      non-NULL indicates the caller wants TLBs flushed
 */
void
pmap_page_protect_options(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg)
{
	pmap_paddr_t    phys = ptoa(ppnum);

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	if (prot == VM_PROT_ALL) {
		return;         /* nothing to do */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);

#if XNU_MONITOR
	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
#else
	pmap_page_protect_options_internal(ppnum, prot, options, arg);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
}
5129 
5130 
5131 #if __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX)
/*
 * Mark the given user pmap as having JOP (pointer authentication of jump-
 * oriented-programming targets) disabled.  Panics if called on the kernel
 * pmap, whose JOP configuration cannot be changed.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5141 
/*
 * Disable user JOP for the given pmap, dispatching to the PPL when one is
 * present.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5151 #endif /* __has_feature(ptrauth_calls) && defined(XNU_TARGET_OS_OSX) */
5152 
5153 /*
5154  * Indicates if the pmap layer enforces some additional restrictions on the
5155  * given set of protections.
5156  */
bool
pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
{
	/* The ARM pmap imposes no additional protection policy beyond the VM's. */
	return false;
}
5162 
5163 /*
5164  *	Set the physical protection on the
5165  *	specified range of this map as requested.
5166  *	VERY IMPORTANT: Will not increase permissions.
5167  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5168  */
void
pmap_protect(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot)
{
	/* Forward to the options-based variant with no options and no argument. */
	pmap_protect_options(pmap, b, e, prot, 0, NULL);
}
5178 
5179 MARK_AS_PMAP_TEXT vm_map_address_t
5180 pmap_protect_options_internal(
5181 	pmap_t pmap,
5182 	vm_map_address_t start,
5183 	vm_map_address_t end,
5184 	vm_prot_t prot,
5185 	unsigned int options,
5186 	__unused void *args)
5187 {
5188 	tt_entry_t      *tte_p;
5189 	pt_entry_t      *bpte_p, *epte_p;
5190 	pt_entry_t      *pte_p;
5191 	boolean_t        set_NX = TRUE;
5192 	boolean_t        set_XO = FALSE;
5193 	boolean_t        should_have_removed = FALSE;
5194 	bool             need_strong_sync = false;
5195 
5196 	/* Validate the pmap input before accessing its data. */
5197 	validate_pmap_mutable(pmap);
5198 
5199 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5200 
5201 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5202 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5203 	}
5204 
5205 #if DEVELOPMENT || DEBUG
5206 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5207 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5208 			should_have_removed = TRUE;
5209 		}
5210 	} else
5211 #endif
5212 	{
5213 		/* Determine the new protection. */
5214 		switch (prot) {
5215 		case VM_PROT_EXECUTE:
5216 			set_XO = TRUE;
5217 			OS_FALLTHROUGH;
5218 		case VM_PROT_READ:
5219 		case VM_PROT_READ | VM_PROT_EXECUTE:
5220 			break;
5221 		case VM_PROT_READ | VM_PROT_WRITE:
5222 		case VM_PROT_ALL:
5223 			return end;         /* nothing to do */
5224 		default:
5225 			should_have_removed = TRUE;
5226 		}
5227 	}
5228 
5229 	if (should_have_removed) {
5230 		panic("%s: should have been a remove operation, "
5231 		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5232 		    __FUNCTION__,
5233 		    pmap, (void *)start, (void *)end, prot, options, args);
5234 	}
5235 
5236 #if DEVELOPMENT || DEBUG
5237 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5238 #else
5239 	if ((prot & VM_PROT_EXECUTE))
5240 #endif
5241 	{
5242 		set_NX = FALSE;
5243 	} else {
5244 		set_NX = TRUE;
5245 	}
5246 
5247 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5248 	vm_map_address_t va = start;
5249 	unsigned int npages = 0;
5250 
5251 	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
5252 
5253 	tte_p = pmap_tte(pmap, start);
5254 
5255 	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
5256 		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
5257 		bpte_p = &bpte_p[pte_index(pt_attr, start)];
5258 		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
5259 		pte_p = bpte_p;
5260 
5261 		for (pte_p = bpte_p;
5262 		    pte_p < epte_p;
5263 		    pte_p += PAGE_RATIO, va += pmap_page_size) {
5264 			++npages;
5265 			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
5266 			    pmap_pending_preemption())) {
5267 				break;
5268 			}
5269 			pt_entry_t spte;
5270 #if DEVELOPMENT || DEBUG
5271 			boolean_t  force_write = FALSE;
5272 #endif
5273 
5274 			spte = *((volatile pt_entry_t*)pte_p);
5275 
5276 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5277 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5278 				continue;
5279 			}
5280 
5281 			pmap_paddr_t    pa;
5282 			unsigned int    pai = 0;
5283 			boolean_t       managed = FALSE;
5284 
5285 			while (!managed) {
5286 				/*
5287 				 * It may be possible for the pte to transition from managed
5288 				 * to unmanaged in this timeframe; for now, elide the assert.
5289 				 * We should break out as a consequence of checking pa_valid.
5290 				 */
5291 				// assert(!ARM_PTE_IS_COMPRESSED(spte));
5292 				pa = pte_to_pa(spte);
5293 				if (!pa_valid(pa)) {
5294 					break;
5295 				}
5296 				pai = pa_index(pa);
5297 				pvh_lock(pai);
5298 				spte = *((volatile pt_entry_t*)pte_p);
5299 				pa = pte_to_pa(spte);
5300 				if (pai == pa_index(pa)) {
5301 					managed = TRUE;
5302 					break; // Leave the PVH locked as we will unlock it after we free the PTE
5303 				}
5304 				pvh_unlock(pai);
5305 			}
5306 
5307 			if ((spte == ARM_PTE_TYPE_FAULT) ||
5308 			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
5309 				continue;
5310 			}
5311 
5312 			pt_entry_t      tmplate;
5313 
5314 			if (pmap == kernel_pmap) {
5315 #if DEVELOPMENT || DEBUG
5316 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5317 					force_write = TRUE;
5318 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
5319 				} else
5320 #endif
5321 				{
5322 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5323 				}
5324 			} else {
5325 #if DEVELOPMENT || DEBUG
5326 				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5327 					assert(pmap->type != PMAP_TYPE_NESTED);
5328 					force_write = TRUE;
5329 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
5330 				} else
5331 #endif
5332 				{
5333 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5334 				}
5335 			}
5336 
5337 			/*
5338 			 * XXX Removing "NX" would
5339 			 * grant "execute" access
5340 			 * immediately, bypassing any
5341 			 * checks VM might want to do
5342 			 * in its soft fault path.
5343 			 * pmap_protect() and co. are
5344 			 * not allowed to increase
5345 			 * access permissions.
5346 			 */
5347 			if (set_NX) {
5348 				tmplate |= pt_attr_leaf_xn(pt_attr);
5349 			} else {
5350 				if (pmap == kernel_pmap) {
5351 					/* do NOT clear "PNX"! */
5352 					tmplate |= ARM_PTE_NX;
5353 				} else {
5354 					/* do NOT clear "NX"! */
5355 					tmplate |= pt_attr_leaf_x(pt_attr);
5356 					if (set_XO) {
5357 						tmplate &= ~ARM_PTE_APMASK;
5358 						tmplate |= pt_attr_leaf_rona(pt_attr);
5359 					}
5360 				}
5361 			}
5362 
5363 #if DEVELOPMENT || DEBUG
5364 			if (force_write) {
5365 				/*
5366 				 * TODO: Run CS/Monitor checks here.
5367 				 */
5368 				if (managed) {
5369 					/*
5370 					 * We are marking the page as writable,
5371 					 * so we consider it to be modified and
5372 					 * referenced.
5373 					 */
5374 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5375 					tmplate |= ARM_PTE_AF;
5376 
5377 					if (ppattr_test_reffault(pai)) {
5378 						ppattr_clear_reffault(pai);
5379 					}
5380 
5381 					if (ppattr_test_modfault(pai)) {
5382 						ppattr_clear_modfault(pai);
5383 					}
5384 				}
5385 			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5386 				/*
5387 				 * An immediate request for anything other than
5388 				 * write should still mark the page as
5389 				 * referenced if managed.
5390 				 */
5391 				if (managed) {
5392 					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
5393 					tmplate |= ARM_PTE_AF;
5394 
5395 					if (ppattr_test_reffault(pai)) {
5396 						ppattr_clear_reffault(pai);
5397 					}
5398 				}
5399 			}
5400 #endif
5401 
5402 			/* We do not expect to write fast fault the entry. */
5403 			pte_set_was_writeable(tmplate, false);
5404 
5405 			write_pte_fast(pte_p, tmplate);
5406 
5407 			if (managed) {
5408 				pvh_assert_locked(pai);
5409 				pvh_unlock(pai);
5410 			}
5411 		}
5412 		FLUSH_PTE_STRONG();
5413 		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
5414 	} else {
5415 		va = end;
5416 	}
5417 
5418 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
5419 	return va;
5420 }
5421 
/*
 * Restrict the access protections on the existing mappings covering [b, e)
 * in "pmap" to at most "prot".  This routine may only reduce permissions;
 * requests that would effectively remove all access are turned into
 * pmap_remove_options() calls (DEVELOPMENT/DEBUG immediate mode) or
 * panic in the internal helper.  The range is processed one twig-level
 * (L2 table) chunk at a time so that each chunk runs under a bounded
 * lock hold, with preemption possible between chunks.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to this pmap's leaf (page) size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		/* Immediate mode: a no-access request becomes a removal. */
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Any other combination (e.g. VM_PROT_NONE) is a removal. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Walk the range in twig-sized chunks; the helper returns how far it
	 * actually got (it may stop early on pending preemption), and we
	 * resume from there.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5496 
5497 /**
5498  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5499  *
5500  * @param pmap pmap to insert the pages into.
5501  * @param va virtual address to map the pages into.
5502  * @param pa page number of the first physical page to map.
5503  * @param size block size, in number of pages.
5504  * @param prot mapping protection attributes.
5505  * @param attr flags to pass to pmap_enter().
5506  *
5507  * @return KERN_SUCCESS.
5508  */
5509 kern_return_t
5510 pmap_map_block(
5511 	pmap_t pmap,
5512 	addr64_t va,
5513 	ppnum_t pa,
5514 	uint32_t size,
5515 	vm_prot_t prot,
5516 	int attr,
5517 	unsigned int flags)
5518 {
5519 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5520 }
5521 
5522 /**
5523  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5524  * As opposed to pmap_map_block(), this function takes
5525  * a physical address as an input and operates using the
5526  * page size associated with the input pmap.
5527  *
5528  * @param pmap pmap to insert the pages into.
5529  * @param va virtual address to map the pages into.
5530  * @param pa physical address of the first physical page to map.
5531  * @param size block size, in number of pages.
5532  * @param prot mapping protection attributes.
5533  * @param attr flags to pass to pmap_enter().
5534  *
5535  * @return KERN_SUCCESS.
5536  */
5537 kern_return_t
5538 pmap_map_block_addr(
5539 	pmap_t pmap,
5540 	addr64_t va,
5541 	pmap_paddr_t pa,
5542 	uint32_t size,
5543 	vm_prot_t prot,
5544 	int attr,
5545 	unsigned int flags)
5546 {
5547 #if __ARM_MIXED_PAGE_SIZE__
5548 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5549 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5550 #else
5551 	const uint64_t pmap_page_size = PAGE_SIZE;
5552 #endif
5553 
5554 	for (ppnum_t page = 0; page < size; page++) {
5555 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5556 			panic("%s: failed pmap_enter_addr, "
5557 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5558 			    __FUNCTION__,
5559 			    pmap, va, (uint64_t)pa, size, prot, flags);
5560 		}
5561 
5562 		va += pmap_page_size;
5563 		pa += pmap_page_size;
5564 	}
5565 
5566 	return KERN_SUCCESS;
5567 }
5568 
5569 kern_return_t
5570 pmap_enter_addr(
5571 	pmap_t pmap,
5572 	vm_map_address_t v,
5573 	pmap_paddr_t pa,
5574 	vm_prot_t prot,
5575 	vm_prot_t fault_type,
5576 	unsigned int flags,
5577 	boolean_t wired)
5578 {
5579 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5580 }
5581 
5582 /*
5583  *	Insert the given physical page (p) at
5584  *	the specified virtual address (v) in the
5585  *	target physical map with the protection requested.
5586  *
5587  *	If specified, the page will be wired down, meaning
5588  *	that the related pte can not be reclaimed.
5589  *
5590  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5591  *	or lose information.  That is, this routine must actually
5592  *	insert this page into the given map eventually (must make
5593  *	forward progress eventually.
5594  */
5595 kern_return_t
5596 pmap_enter(
5597 	pmap_t pmap,
5598 	vm_map_address_t v,
5599 	ppnum_t pn,
5600 	vm_prot_t prot,
5601 	vm_prot_t fault_type,
5602 	unsigned int flags,
5603 	boolean_t wired)
5604 {
5605 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5606 }
5607 
5608 /*
5609  * Attempt to commit the pte.
5610  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5611  * Performs no page table or accounting writes on failures.
5612  */
5613 static inline bool
5614 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5615 {
5616 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5617 	bool success = false, changed_wiring = false;
5618 
5619 	__unreachable_ok_push
5620 	if (TEST_PAGE_RATIO_4) {
5621 		/*
5622 		 * 16K virtual pages w/ 4K hw pages.
5623 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5624 		 * As a result we require the exclusive pmap lock.
5625 		 */
5626 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5627 		*old_pte = *pte_p;
5628 		if (*old_pte == new_pte) {
5629 			/* Another thread completed this operation. Nothing to do here. */
5630 			success = true;
5631 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5632 		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5633 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5634 			success = false;
5635 		} else {
5636 			write_pte_fast(pte_p, new_pte);
5637 			success = true;
5638 		}
5639 	} else {
5640 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5641 	}
5642 	__unreachable_ok_pop
5643 
5644 	if (success && *old_pte != new_pte) {
5645 		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5646 			FLUSH_PTE_STRONG();
5647 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), false, true);
5648 		} else {
5649 			FLUSH_PTE();
5650 			__builtin_arm_isb(ISB_SY);
5651 		}
5652 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5653 		    (new_pte & ARM_PTE_WIRED) != 0 :
5654 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5655 
5656 		if (pmap != kernel_pmap && changed_wiring) {
5657 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5658 			if (new_pte & ARM_PTE_WIRED) {
5659 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5660 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5661 			} else {
5662 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5663 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5664 			}
5665 		}
5666 
5667 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5668 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5669 	}
5670 	return success;
5671 }
5672 
/*
 * Translate VM_WIMG_* cacheability/ordering flags into the corresponding
 * PTE attribute bits (memory-attribute index, shareability, and — for
 * device/uncached types — NX/PNX to forbid execution).
 *
 * The physical address is consulted only for VM_WIMG_IO, to decide whether
 * the target is DRAM (which may use a reorderable device type) or true MMIO.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		// Real-time memory: uncached-normal where supported, else fully disabled.
#if HAS_UCNORMAL_MEM
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
#else
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#endif
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		// Cached types are executable-capable; they get a shareability domain instead of NX.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Unrecognized WIMG: fall back to the platform default (normal cached memory).
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5735 
5736 
5737 /*
5738  * Construct a PTE (and the physical page attributes) for the given virtual to
5739  * physical mapping.
5740  *
5741  * This function has no side effects and is safe to call so that it is safe to
5742  * call while attempting a pmap_enter transaction.
5743  */
5744 MARK_AS_PMAP_TEXT static pt_entry_t
5745 pmap_construct_pte(
5746 	const pmap_t pmap,
5747 	vm_map_address_t va,
5748 	pmap_paddr_t pa,
5749 	vm_prot_t prot,
5750 	vm_prot_t fault_type,
5751 	boolean_t wired,
5752 	const pt_attr_t* const pt_attr,
5753 	uint16_t *pp_attr_bits /* OUTPUT */
5754 	)
5755 {
5756 	bool set_NX = false, set_XO = false;
5757 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5758 	assert(pp_attr_bits != NULL);
5759 	*pp_attr_bits = 0;
5760 
5761 	if (wired) {
5762 		pte |= ARM_PTE_WIRED;
5763 	}
5764 
5765 #if DEVELOPMENT || DEBUG
5766 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5767 #else
5768 	if ((prot & VM_PROT_EXECUTE))
5769 #endif
5770 	{
5771 		set_NX = false;
5772 	} else {
5773 		set_NX = true;
5774 	}
5775 
5776 	if (prot == VM_PROT_EXECUTE) {
5777 		set_XO = true;
5778 	}
5779 
5780 	if (set_NX) {
5781 		pte |= pt_attr_leaf_xn(pt_attr);
5782 	} else {
5783 		if (pmap == kernel_pmap) {
5784 			pte |= ARM_PTE_NX;
5785 		} else {
5786 			pte |= pt_attr_leaf_x(pt_attr);
5787 		}
5788 	}
5789 
5790 	if (pmap == kernel_pmap) {
5791 #if __ARM_KERNEL_PROTECT__
5792 		pte |= ARM_PTE_NG;
5793 #endif /* __ARM_KERNEL_PROTECT__ */
5794 		if (prot & VM_PROT_WRITE) {
5795 			pte |= ARM_PTE_AP(AP_RWNA);
5796 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5797 		} else {
5798 			pte |= ARM_PTE_AP(AP_RONA);
5799 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5800 		}
5801 	} else {
5802 		if (pmap->type != PMAP_TYPE_NESTED) {
5803 			pte |= ARM_PTE_NG;
5804 		} else if ((pmap->nested_region_asid_bitmap)
5805 		    && (va >= pmap->nested_region_addr)
5806 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5807 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5808 
5809 			if ((pmap->nested_region_asid_bitmap)
5810 			    && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
5811 				pte |= ARM_PTE_NG;
5812 			}
5813 		}
5814 		if (prot & VM_PROT_WRITE) {
5815 			assert(pmap->type != PMAP_TYPE_NESTED);
5816 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5817 				if (fault_type & VM_PROT_WRITE) {
5818 					pte |= pt_attr_leaf_rw(pt_attr);
5819 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5820 				} else {
5821 					pte |= pt_attr_leaf_ro(pt_attr);
5822 					/*
5823 					 * Mark the page as MODFAULT so that a subsequent write
5824 					 * may be handled through arm_fast_fault().
5825 					 */
5826 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5827 					pte_set_was_writeable(pte, true);
5828 				}
5829 			} else {
5830 				pte |= pt_attr_leaf_rw(pt_attr);
5831 				*pp_attr_bits |= PP_ATTR_REFERENCED;
5832 			}
5833 		} else {
5834 			if (set_XO) {
5835 				pte |= pt_attr_leaf_rona(pt_attr);
5836 			} else {
5837 				pte |= pt_attr_leaf_ro(pt_attr);
5838 			}
5839 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5840 		}
5841 	}
5842 
5843 	pte |= ARM_PTE_AF;
5844 	return pte;
5845 }
5846 
5847 MARK_AS_PMAP_TEXT kern_return_t
5848 pmap_enter_options_internal(
5849 	pmap_t pmap,
5850 	vm_map_address_t v,
5851 	pmap_paddr_t pa,
5852 	vm_prot_t prot,
5853 	vm_prot_t fault_type,
5854 	unsigned int flags,
5855 	boolean_t wired,
5856 	unsigned int options)
5857 {
5858 	ppnum_t         pn = (ppnum_t)atop(pa);
5859 	pt_entry_t      pte;
5860 	pt_entry_t      spte;
5861 	pt_entry_t      *pte_p;
5862 	bool            refcnt_updated;
5863 	bool            wiredcnt_updated;
5864 	bool            ro_va = false;
5865 	unsigned int    wimg_bits;
5866 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
5867 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
5868 	kern_return_t   kr = KERN_SUCCESS;
5869 	uint16_t pp_attr_bits;
5870 	volatile uint16_t *refcnt;
5871 	volatile uint16_t *wiredcnt;
5872 	pv_free_list_t *local_pv_free;
5873 
5874 	validate_pmap_mutable(pmap);
5875 
5876 #if XNU_MONITOR
5877 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
5878 		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
5879 	}
5880 #endif
5881 
5882 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5883 
5884 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
5885 		panic("pmap_enter_options() pmap %p v 0x%llx",
5886 		    pmap, (uint64_t)v);
5887 	}
5888 
5889 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
5890 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
5891 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
5892 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
5893 	}
5894 
5895 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
5896 		panic("pmap_enter_options() pmap %p pa 0x%llx",
5897 		    pmap, (uint64_t)pa);
5898 	}
5899 
5900 	/* The PA should not extend beyond the architected physical address space */
5901 	pa &= ARM_PTE_PAGE_MASK;
5902 
5903 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
5904 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
5905 		extern vm_offset_t ctrr_test_page;
5906 		if (__probable(v != ctrr_test_page))
5907 #endif
5908 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
5909 	}
5910 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
5911 		if (__improbable(prot != VM_PROT_READ)) {
5912 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
5913 			    __func__, (unsigned long long)v, prot);
5914 		}
5915 		ro_va = true;
5916 	}
5917 	assert(pn != vm_page_fictitious_addr);
5918 
5919 	refcnt_updated = false;
5920 	wiredcnt_updated = false;
5921 
5922 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
5923 		/*
5924 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
5925 		 *
5926 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
5927 		 */
5928 		lock_mode = PMAP_LOCK_EXCLUSIVE;
5929 	}
5930 
5931 	if (!pmap_lock_preempt(pmap, lock_mode)) {
5932 		return KERN_ABORTED;
5933 	}
5934 
5935 	/*
5936 	 *	Expand pmap to include this pte.  Assume that
5937 	 *	pmap is always expanded to include enough hardware
5938 	 *	pages to map one VM page.
5939 	 */
5940 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
5941 		/* Must unlock to expand the pmap. */
5942 		pmap_unlock(pmap, lock_mode);
5943 
5944 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
5945 
5946 		if (kr != KERN_SUCCESS) {
5947 			return kr;
5948 		}
5949 
5950 		if (!pmap_lock_preempt(pmap, lock_mode)) {
5951 			return KERN_ABORTED;
5952 		}
5953 	}
5954 
5955 	if (options & PMAP_OPTIONS_NOENTER) {
5956 		pmap_unlock(pmap, lock_mode);
5957 		return KERN_SUCCESS;
5958 	}
5959 
5960 	/*
5961 	 * Since we may not hold the pmap lock exclusive, updating the pte is
5962 	 * done via a cmpxchg loop.
5963 	 * We need to be careful about modifying non-local data structures before commiting
5964 	 * the new pte since we may need to re-do the transaction.
5965 	 */
5966 	spte = os_atomic_load(pte_p, relaxed);
5967 	while (!committed) {
5968 		refcnt = NULL;
5969 		wiredcnt = NULL;
5970 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
5971 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
5972 
5973 		if (pmap != kernel_pmap) {
5974 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
5975 			refcnt = &ptd_info->refcnt;
5976 			wiredcnt = &ptd_info->wiredcnt;
5977 			/*
5978 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
5979 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
5980 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
5981 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
5982 			 * have PTDs, so we can't use the check there.
5983 			 */
5984 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
5985 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
5986 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
5987 			}
5988 			/*
5989 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
5990 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
5991 			 * or acquire the pmap lock exclusive.
5992 			 */
5993 			if (!wiredcnt_updated) {
5994 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
5995 				wiredcnt_updated = true;
5996 			}
5997 			if (!refcnt_updated) {
5998 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
5999 				refcnt_updated = true;
6000 				drop_refcnt = true;
6001 			}
6002 		}
6003 
6004 #if XNU_MONITOR
6005 		/**
6006 		 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6007 		 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6008 		 * created and later removed.  We must therefore prevent an attacker from downgrading a
6009 		 * a writable mapping in order to allow it to be removed and remapped to something else.
6010 		 */
6011 		if (__improbable(had_valid_mapping && !pa_valid(pte_to_pa(spte)) &&
6012 		    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !(prot & VM_PROT_WRITE) &&
6013 		    (pmap_cache_attributes((ppnum_t)atop(pte_to_pa(spte))) & PP_ATTR_MONITOR))) {
6014 			panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6015 			    __func__, (uint64_t)pte_to_pa(spte));
6016 		}
6017 #endif
6018 
6019 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6020 			/*
6021 			 * There is already a mapping here & it's for a different physical page.
6022 			 * First remove that mapping.
6023 			 *
6024 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6025 			 */
6026 			if (lock_mode == PMAP_LOCK_SHARED) {
6027 				if (pmap_lock_shared_to_exclusive(pmap)) {
6028 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6029 				} else {
6030 					/*
6031 					 * We failed to upgrade to an exclusive lock.
6032 					 * As a result we no longer hold the lock at all,
6033 					 * so we need to re-acquire it and restart the transaction.
6034 					 */
6035 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6036 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6037 					/* pmap might have changed after we dropped the lock. Try again. */
6038 					spte = os_atomic_load(pte_p, relaxed);
6039 					continue;
6040 				}
6041 			}
6042 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6043 			spte = ARM_PTE_TYPE_FAULT;
6044 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6045 		}
6046 
6047 		/*
6048 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6049 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6050 		 * read-write protection. The PMAP layer though still needs to use the right
6051 		 * index, which is the older XO-now-TPRO one and that is specially selected
6052 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6053 		 */
6054 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6055 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6056 		} else {
6057 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6058 		}
6059 
6060 		if (pa_valid(pa)) {
6061 			unsigned int pai;
6062 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6063 
6064 			is_internal = FALSE;
6065 			is_altacct = FALSE;
6066 
6067 			pai = pa_index(pa);
6068 
6069 			pvh_lock(pai);
6070 
6071 			/*
6072 			 * Make sure that the current per-cpu PV free list has
6073 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6074 			 * if the transaction succeeds. We're either in the
6075 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6076 			 * Note that we can still be interrupted, but a primary
6077 			 * interrupt handler can never enter the pmap.
6078 			 */
6079 #if !XNU_MONITOR
6080 			assert(get_preemption_level() > 0);
6081 #endif
6082 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6083 			pv_entry_t **pv_h = pai_to_pvh(pai);
6084 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6085 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6086 
6087 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6088 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6089 				int new_allocated_pves = 0;
6090 
6091 				while (new_allocated_pves < 2) {
6092 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6093 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6094 					if (pv_status == PV_ALLOC_FAIL) {
6095 						break;
6096 					} else if (pv_status == PV_ALLOC_RETRY) {
6097 						/*
6098 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6099 						 * it will have dropped the pmap lock while doing so.
6100 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6101 						 * be on a different CPU now.
6102 						 */
6103 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6104 					} else {
6105 						/* If we've gotten this far then a node should've been allocated. */
6106 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6107 
6108 						new_allocated_pves++;
6109 					}
6110 				}
6111 
6112 				for (int i = 0; i < new_allocated_pves; i++) {
6113 					pv_free(new_pve_p[i]);
6114 				}
6115 			}
6116 
6117 			if (pv_status == PV_ALLOC_FAIL) {
6118 				pvh_unlock(pai);
6119 				kr = KERN_RESOURCE_SHORTAGE;
6120 				break;
6121 			} else if (pv_status == PV_ALLOC_RETRY) {
6122 				pvh_unlock(pai);
6123 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6124 				spte = os_atomic_load(pte_p, relaxed);
6125 				continue;
6126 			}
6127 
6128 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6129 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6130 			} else {
6131 				wimg_bits = pmap_cache_attributes(pn);
6132 			}
6133 
6134 			/* We may be retrying this operation after dropping the PVH lock.
6135 			 * Cache attributes for the physical page may have changed while the lock
6136 			 * was dropped, so clear any cache attributes we may have previously set
6137 			 * in the PTE template. */
6138 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6139 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6140 
6141 #if XNU_MONITOR
6142 			/* The regular old kernel is not allowed to remap PPL pages. */
6143 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6144 				panic("%s: page belongs to PPL, "
6145 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6146 				    __FUNCTION__,
6147 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6148 			}
6149 
6150 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6151 				panic("%s: page locked down, "
6152 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6153 				    __FUNCTION__,
6154 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6155 			}
6156 #endif
6157 
6158 
6159 
6160 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6161 			if (!committed) {
6162 				pvh_unlock(pai);
6163 				continue;
6164 			}
6165 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6166 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6167 
6168 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6169 			/*
6170 			 * If there was already a valid pte here then we reuse its reference
6171 			 * on the ptd and drop the one that we took above.
6172 			 */
6173 			drop_refcnt = had_valid_mapping;
6174 
6175 			if (!had_valid_mapping) {
6176 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6177 				int pve_ptep_idx = 0;
6178 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6179 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6180 				if (pv_status != PV_ALLOC_SUCCESS) {
6181 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6182 					    __func__, pv_status, new_pve_p, pmap);
6183 				}
6184 
6185 				if (pmap != kernel_pmap) {
6186 					if (options & PMAP_OPTIONS_INTERNAL) {
6187 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6188 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6189 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6190 							/*
6191 							 * Make a note to ourselves that this
6192 							 * mapping is using alternative
6193 							 * accounting. We'll need this in order
6194 							 * to know which ledger to debit when
6195 							 * the mapping is removed.
6196 							 *
6197 							 * The altacct bit must be set while
6198 							 * the pv head is locked. Defer the
6199 							 * ledger accounting until after we've
6200 							 * dropped the lock.
6201 							 */
6202 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6203 							is_altacct = TRUE;
6204 						}
6205 					}
6206 					if (ppattr_test_reusable(pai) &&
6207 					    !is_altacct) {
6208 						is_reusable = TRUE;
6209 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6210 						is_internal = TRUE;
6211 					} else {
6212 						is_external = TRUE;
6213 					}
6214 				}
6215 			}
6216 
6217 			pvh_unlock(pai);
6218 
6219 			if (pp_attr_bits != 0) {
6220 				ppattr_pa_set_bits(pa, pp_attr_bits);
6221 			}
6222 
6223 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6224 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6225 
6226 				if (is_internal) {
6227 					/*
6228 					 * Make corresponding adjustments to
6229 					 * phys_footprint statistics.
6230 					 */
6231 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6232 					if (is_altacct) {
6233 						/*
6234 						 * If this page is internal and
6235 						 * in an IOKit region, credit
6236 						 * the task's total count of
6237 						 * dirty, internal IOKit pages.
6238 						 * It should *not* count towards
6239 						 * the task's total physical
6240 						 * memory footprint, because
6241 						 * this entire region was
6242 						 * already billed to the task
6243 						 * at the time the mapping was
6244 						 * created.
6245 						 *
6246 						 * Put another way, this is
6247 						 * internal++ and
6248 						 * alternate_accounting++, so
6249 						 * net effect on phys_footprint
6250 						 * is 0. That means: don't
6251 						 * touch phys_footprint here.
6252 						 */
6253 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6254 					} else {
6255 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6256 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6257 							skip_footprint_debit = true;
6258 						} else {
6259 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6260 						}
6261 					}
6262 				}
6263 				if (is_reusable) {
6264 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6265 				} else if (is_external) {
6266 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6267 				}
6268 			}
6269 		} else {
6270 			if (prot & VM_PROT_EXECUTE) {
6271 				kr = KERN_FAILURE;
6272 				break;
6273 			}
6274 
6275 			wimg_bits = pmap_cache_attributes(pn);
6276 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6277 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6278 			}
6279 
6280 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6281 
6282 #if XNU_MONITOR
6283 			if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
6284 				uint64_t xprr_perm = pte_to_xprr_perm(pte);
6285 				switch (xprr_perm) {
6286 				case XPRR_KERN_RO_PERM:
6287 					break;
6288 				case XPRR_KERN_RW_PERM:
6289 					pte &= ~ARM_PTE_XPRR_MASK;
6290 					pte |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
6291 					break;
6292 				default:
6293 					panic("Unsupported xPRR perm %llu for pte 0x%llx", xprr_perm, (uint64_t)pte);
6294 				}
6295 			}
6296 #endif
6297 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6298 			if (committed) {
6299 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6300 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6301 
6302 				/**
6303 				 * If there was already a valid pte here then we reuse its
6304 				 * reference on the ptd and drop the one that we took above.
6305 				 */
6306 				drop_refcnt = had_valid_mapping;
6307 			}
6308 		}
6309 		if (committed) {
6310 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6311 				assert(pmap != kernel_pmap);
6312 
6313 				/* One less "compressed" */
6314 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6315 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6316 
6317 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6318 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6319 				} else if (!skip_footprint_debit) {
6320 					/* Was part of the footprint */
6321 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6322 				}
6323 				/* The old entry held a reference so drop the extra one that we took above. */
6324 				drop_refcnt = true;
6325 			}
6326 		}
6327 	}
6328 
6329 	if (drop_refcnt && refcnt != NULL) {
6330 		assert(refcnt_updated);
6331 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6332 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6333 		}
6334 	}
6335 
6336 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6337 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6338 	}
6339 
6340 	pmap_unlock(pmap, lock_mode);
6341 
6342 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6343 		pmap_phys_write_disable(v);
6344 	}
6345 
6346 	return kr;
6347 }
6348 
/*
 * Establish a mapping at virtual address 'v' to physical address 'pa' in
 * 'pmap', retrying the underlying enter operation as needed.
 *
 * Retries on KERN_RESOURCE_SHORTAGE (after replenishing the PPL page pool
 * on XNU_MONITOR configurations) and on KERN_ABORTED (the internal call
 * bailed out due to pending preemption).  If the caller passed
 * PMAP_OPTIONS_NOWAIT, a resource shortage is returned to the caller
 * instead of being retried.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/*
		 * The PPL cannot block for page allocation, so always pass NOWAIT
		 * and handle shortages here in the kernel by refilling the PPL
		 * page free list below.
		 */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Give the PPL a page so the retry can succeed. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				/* Caller asked not to wait; report the shortage. */
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6394 
6395 kern_return_t
6396 pmap_enter_options(
6397 	pmap_t pmap,
6398 	vm_map_address_t v,
6399 	ppnum_t pn,
6400 	vm_prot_t prot,
6401 	vm_prot_t fault_type,
6402 	unsigned int flags,
6403 	boolean_t wired,
6404 	unsigned int options,
6405 	__unused void   *arg)
6406 {
6407 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6408 }
6409 
6410 /*
6411  *	Routine:	pmap_change_wiring
6412  *	Function:	Change the wiring attribute for a map/virtual-address
6413  *			pair.
6414  *	In/out conditions:
6415  *			The mapping must already exist in the pmap.
6416  */
6417 MARK_AS_PMAP_TEXT kern_return_t
6418 pmap_change_wiring_internal(
6419 	pmap_t pmap,
6420 	vm_map_address_t v,
6421 	boolean_t wired)
6422 {
6423 	pt_entry_t     *pte_p;
6424 	pmap_paddr_t    pa;
6425 
6426 	validate_pmap_mutable(pmap);
6427 
6428 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6429 		return KERN_ABORTED;
6430 	}
6431 
6432 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6433 
6434 	pte_p = pmap_pte(pmap, v);
6435 	if (pte_p == PT_ENTRY_NULL) {
6436 		if (!wired) {
6437 			/*
6438 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6439 			 * may have been freed by a remove operation.
6440 			 */
6441 			goto pmap_change_wiring_return;
6442 		} else {
6443 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6444 		}
6445 	}
6446 	/*
6447 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6448 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6449 	 */
6450 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6451 
6452 	while (pa_valid(pa)) {
6453 		pmap_paddr_t new_pa;
6454 
6455 		pvh_lock(pa_index(pa));
6456 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6457 
6458 		if (pa == new_pa) {
6459 			break;
6460 		}
6461 
6462 		pvh_unlock(pa_index(pa));
6463 		pa = new_pa;
6464 	}
6465 
6466 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6467 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6468 		if (!wired) {
6469 			/* PTE cleared by prior remove/disconnect operation */
6470 			goto pmap_change_wiring_cleanup;
6471 		} else {
6472 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6473 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6474 		}
6475 	}
6476 
6477 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6478 	if (wired != pte_is_wired(*pte_p)) {
6479 		pte_set_wired(pmap, pte_p, wired);
6480 		if (pmap != kernel_pmap) {
6481 			if (wired) {
6482 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6483 			} else if (!wired) {
6484 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6485 			}
6486 		}
6487 	}
6488 
6489 pmap_change_wiring_cleanup:
6490 	if (pa_valid(pa)) {
6491 		pvh_unlock(pa_index(pa));
6492 	}
6493 
6494 pmap_change_wiring_return:
6495 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6496 
6497 	return KERN_SUCCESS;
6498 }
6499 
/*
 * Public entry point for changing the wiring attribute of an existing
 * mapping.  Retries the PPL call on KERN_ABORTED (pending preemption) and
 * panics on any other failure, since callers provide no error path.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6527 
6528 MARK_AS_PMAP_TEXT pmap_paddr_t
6529 pmap_find_pa_internal(
6530 	pmap_t pmap,
6531 	addr64_t va)
6532 {
6533 	pmap_paddr_t    pa = 0;
6534 
6535 	validate_pmap(pmap);
6536 
6537 	if (pmap != kernel_pmap) {
6538 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6539 	}
6540 
6541 	pa = pmap_vtophys(pmap, va);
6542 
6543 	if (pmap != kernel_pmap) {
6544 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6545 	}
6546 
6547 	return pa;
6548 }
6549 
6550 pmap_paddr_t
6551 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6552 {
6553 	pmap_paddr_t pa = 0;
6554 
6555 	if (pmap == kernel_pmap) {
6556 		pa = mmu_kvtop(va);
6557 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6558 		/*
6559 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6560 		 * translation even if PAN would prevent kernel access through the translation.
6561 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6562 		 */
6563 		pa = mmu_uvtop(va);
6564 	}
6565 	return pa;
6566 }
6567 
/*
 * Translate 'va' in 'pmap' to a physical address, trying the fast
 * MMU-assisted path first and falling back to a locked software walk.
 *
 * Returns 0 if no valid mapping exists.
 */
pmap_paddr_t
pmap_find_pa(
	pmap_t pmap,
	addr64_t va)
{
	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);

	if (pa != 0) {
		return pa;
	}

	if (not_in_kdp) {
		/* Normal operation: take the pmap lock (via the PPL when present). */
#if XNU_MONITOR
		return pmap_find_pa_ppl(pmap, va);
#else
		return pmap_find_pa_internal(pmap, va);
#endif
	} else {
		/* Debugger context: walk the page tables without locking. */
		return pmap_vtophys(pmap, va);
	}
}
6589 
6590 ppnum_t
6591 pmap_find_phys_nofault(
6592 	pmap_t pmap,
6593 	addr64_t va)
6594 {
6595 	ppnum_t ppn;
6596 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6597 	return ppn;
6598 }
6599 
6600 ppnum_t
6601 pmap_find_phys(
6602 	pmap_t pmap,
6603 	addr64_t va)
6604 {
6605 	ppnum_t ppn;
6606 	ppn = atop(pmap_find_pa(pmap, va));
6607 	return ppn;
6608 }
6609 
6610 /**
6611  * Translate a kernel virtual address into a physical address.
6612  *
6613  * @param va The kernel virtual address to translate. Does not work on user
6614  *           virtual addresses.
6615  *
6616  * @return The physical address if the translation was successful, or zero if
6617  *         no valid mappings were found for the given virtual address.
6618  */
6619 pmap_paddr_t
6620 kvtophys(vm_offset_t va)
6621 {
6622 	/**
6623 	 * Attempt to do the translation first in hardware using the AT (address
6624 	 * translation) instruction. This will attempt to use the MMU to do the
6625 	 * translation for us.
6626 	 */
6627 	pmap_paddr_t pa = mmu_kvtop(va);
6628 
6629 	if (pa) {
6630 		return pa;
6631 	}
6632 
6633 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6634 	return pmap_vtophys(kernel_pmap, va);
6635 }
6636 
6637 /**
6638  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6639  * points to a non-kernel-managed physical page, then this call will panic().
6640  *
6641  * @note The output of this function is guaranteed to be a kernel-managed
6642  *       physical page, which means it's safe to pass the output directly to
6643  *       pa_index() to create a physical address index for various pmap data
6644  *       structures.
6645  *
6646  * @param va The kernel virtual address to translate. Does not work on user
6647  *           virtual addresses.
6648  *
6649  * @return The translated physical address for the given virtual address.
6650  */
6651 pmap_paddr_t
6652 kvtophys_nofail(vm_offset_t va)
6653 {
6654 	pmap_paddr_t pa = kvtophys(va);
6655 
6656 	if (!pa_valid(pa)) {
6657 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6658 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6659 	}
6660 
6661 	return pa;
6662 }
6663 
/*
 * Software page-table walk: translate 'va' in 'pmap' to a physical address.
 *
 * Walks from the pmap's root level down toward the leaf level, stopping at
 * the first block/page entry found.  No locks are taken here; callers either
 * hold the pmap lock (pmap_find_pa_internal) or run in debugger context.
 *
 * Returns 0 if 'va' is outside the pmap's range or no valid mapping exists.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Reject addresses outside this pmap's translatable range. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		/* Per-level masks describing how to decode an entry at this level. */
		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at any level terminates the walk with no translation. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table-type entry: descend into the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6708 
6709 /*
6710  *	pmap_init_pte_page - Initialize a page table page.
6711  */
6712 MARK_AS_PMAP_TEXT void
6713 pmap_init_pte_page(
6714 	pmap_t pmap,
6715 	pt_entry_t *pte_p,
6716 	vm_offset_t va,
6717 	unsigned int ttlevel,
6718 	boolean_t alloc_ptd)
6719 {
6720 	pt_desc_t   *ptdp = NULL;
6721 	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6722 
6723 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6724 		if (alloc_ptd) {
6725 			/*
6726 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6727 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6728 			 * bootstrap request, so we check for an existing PTD here.
6729 			 */
6730 			ptdp = ptd_alloc(pmap);
6731 			if (ptdp == NULL) {
6732 				panic("%s: unable to allocate PTD", __func__);
6733 			}
6734 			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6735 			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6736 			pvh_set_flags(pvh, 0);
6737 		} else {
6738 			panic("pmap_init_pte_page(): pte_p %p", pte_p);
6739 		}
6740 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6741 		ptdp = pvh_ptd(pvh);
6742 	} else {
6743 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6744 	}
6745 
6746 	// below barrier ensures previous updates to the page are visible to PTW before
6747 	// it is linked to the PTE of previous level
6748 	__builtin_arm_dmb(DMB_ISHST);
6749 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6750 }
6751 
6752 /*
6753  *	Routine:	pmap_expand
6754  *
6755  *	Expands a pmap to be able to map the specified virtual address.
6756  *
6757  *	Allocates new memory for the default (COARSE) translation table
6758  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6759  *	also allocates space for the corresponding pv entries.
6760  *
6761  *	Nothing should be locked.
6762  */
6763 MARK_AS_PMAP_TEXT static kern_return_t
6764 pmap_expand(
6765 	pmap_t pmap,
6766 	vm_map_address_t v,
6767 	unsigned int options,
6768 	unsigned int level)
6769 {
6770 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6771 
6772 	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6773 		return KERN_INVALID_ADDRESS;
6774 	}
6775 	pmap_paddr_t    pa;
6776 	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
6777 	tt_entry_t              *tte_p;
6778 	tt_entry_t              *tt_p;
6779 
6780 	pa = 0x0ULL;
6781 	tt_p =  (tt_entry_t *)NULL;
6782 
6783 	for (; ttlevel < level; ttlevel++) {
6784 		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
6785 			return KERN_ABORTED;
6786 		}
6787 
6788 		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
6789 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
6790 			while (pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
6791 				if (options & PMAP_OPTIONS_NOWAIT) {
6792 					return KERN_RESOURCE_SHORTAGE;
6793 				}
6794 #if XNU_MONITOR
6795 				panic("%s: failed to allocate tt, "
6796 				    "pmap=%p, v=%p, options=0x%x, level=%u",
6797 				    __FUNCTION__,
6798 				    pmap, (void *)v, options, level);
6799 #else
6800 				VM_PAGE_WAIT();
6801 #endif
6802 			}
6803 
6804 			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6805 				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6806 				return KERN_ABORTED;
6807 			}
6808 
6809 			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
6810 				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
6811 				pa = kvtophys_nofail((vm_offset_t)tt_p);
6812 				tte_p = pmap_ttne(pmap, ttlevel, v);
6813 				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6814 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6815 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
6816 				pa = 0x0ULL;
6817 				tt_p = (tt_entry_t *)NULL;
6818 			}
6819 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6820 		} else {
6821 			pmap_unlock(pmap, PMAP_LOCK_SHARED);
6822 		}
6823 
6824 		if (tt_p != (tt_entry_t *)NULL) {
6825 			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6826 			tt_p = (tt_entry_t *)NULL;
6827 		}
6828 	}
6829 
6830 	return KERN_SUCCESS;
6831 }
6832 
6833 /*
6834  *	Routine:	pmap_gc
6835  *	Function:
6836  *              Pmap garbage collection
6837  *		Called by the pageout daemon when pages are scarce.
6838  *
6839  */
6840 void
6841 pmap_gc(void)
6842 {
6843 	/*
6844 	 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
6845 	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6846 	 * or may contain wired mappings.  However, with the relatively recent change to
6847 	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
6848 	 * page, it may make sense to call that function here.
6849 	 */
6850 }
6851 
6852 /*
6853  *      By default, don't attempt pmap GC more frequently
6854  *      than once / 1 minutes.
6855  */
6856 
6857 void
6858 compute_pmap_gc_throttle(
6859 	void *arg __unused)
6860 {
6861 }
6862 
6863 /*
6864  * pmap_attribute_cache_sync(vm_offset_t pa)
6865  *
6866  * Invalidates all of the instruction cache on a physical page and
6867  * pushes any dirty data from the data cache for the same physical page
6868  */
6869 
6870 kern_return_t
6871 pmap_attribute_cache_sync(
6872 	ppnum_t pp,
6873 	vm_size_t size,
6874 	__unused vm_machine_attribute_t attribute,
6875 	__unused vm_machine_attribute_val_t * value)
6876 {
6877 	if (size > PAGE_SIZE) {
6878 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6879 	} else {
6880 		cache_sync_page(pp);
6881 	}
6882 
6883 	return KERN_SUCCESS;
6884 }
6885 
6886 /*
6887  * pmap_sync_page_data_phys(ppnum_t pp)
6888  *
6889  * Invalidates all of the instruction cache on a physical page and
6890  * pushes any dirty data from the data cache for the same physical page
6891  */
6892 void
6893 pmap_sync_page_data_phys(
6894 	ppnum_t pp)
6895 {
6896 	cache_sync_page(pp);
6897 }
6898 
6899 /*
6900  * pmap_sync_page_attributes_phys(ppnum_t pp)
6901  *
6902  * Write back and invalidate all cachelines on a physical page.
6903  */
6904 void
6905 pmap_sync_page_attributes_phys(
6906 	ppnum_t pp)
6907 {
6908 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6909 }
6910 
6911 #if CONFIG_COREDUMP
/* temporary workaround */
/*
 * Decide whether the page mapped at 'va' in 'map' is safe to include in a
 * core dump.  Returns FALSE when there is no PTE, when the range is backed
 * by a device pager, or when the mapping does not use the default (normal
 * cached) memory attribute.
 */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	pt_entry_t     *pte_p;
	pt_entry_t      spte;

	pte_p = pmap_pte(map->pmap, va);
	if (0 == pte_p) {
		/* No PTE: nothing to dump here. */
		return FALSE;
	}
	if (vm_map_entry_has_device_pager(map, va)) {
		/* Skip device-pager-backed ranges. */
		return FALSE;
	}
	spte = *pte_p;
	/* Only dump pages mapped with the default cache attribute. */
	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
}
6931 #endif
6932 
6933 void
6934 fillPage(
6935 	ppnum_t pn,
6936 	unsigned int fill)
6937 {
6938 	unsigned int   *addr;
6939 	int             count;
6940 
6941 	addr = (unsigned int *) phystokv(ptoa(pn));
6942 	count = PAGE_SIZE / sizeof(unsigned int);
6943 	while (count--) {
6944 		*addr++ = fill;
6945 	}
6946 }
6947 
extern void     mapping_set_mod(ppnum_t pn);

/* Exported wrapper: mark physical page 'pn' as modified (dirty). */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
6956 
extern void     mapping_set_ref(ppnum_t pn);

/* Exported wrapper: mark physical page 'pn' as referenced. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
6965 
6966 /*
6967  * Clear specified attribute bits.
6968  *
6969  * Try to force an arm_fast_fault() for all mappings of
6970  * the page - to force attributes to be set again at fault time.
6971  * If the forcing succeeds, clear the cached bits at the head.
6972  * Otherwise, something must have been wired, so leave the cached
6973  * attributes alone.
6974  */
6975 MARK_AS_PMAP_TEXT static void
6976 phys_attribute_clear_with_flush_range(
6977 	ppnum_t         pn,
6978 	unsigned int    bits,
6979 	int             options,
6980 	void            *arg,
6981 	pmap_tlb_flush_range_t *flush_range)
6982 {
6983 	pmap_paddr_t    pa = ptoa(pn);
6984 	vm_prot_t       allow_mode = VM_PROT_ALL;
6985 
6986 #if XNU_MONITOR
6987 	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
6988 		panic("%s: illegal request, "
6989 		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
6990 		    __FUNCTION__,
6991 		    pn, bits, options, arg, flush_range);
6992 	}
6993 #endif
6994 	if ((arg != NULL) || (flush_range != NULL)) {
6995 		options = options & ~PMAP_OPTIONS_NOFLUSH;
6996 	}
6997 
6998 	if (__improbable((bits & PP_ATTR_MODIFIED) &&
6999 	    (options & PMAP_OPTIONS_NOFLUSH))) {
7000 		panic("phys_attribute_clear(0x%x,0x%x,0x%x,%p,%p): "
7001 		    "should not clear 'modified' without flushing TLBs\n",
7002 		    pn, bits, options, arg, flush_range);
7003 	}
7004 
7005 	assert(pn != vm_page_fictitious_addr);
7006 
7007 	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7008 		assert(bits == PP_ATTR_MODIFIED);
7009 
7010 		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7011 		/*
7012 		 * We short circuit this case; it should not need to
7013 		 * invoke arm_force_fast_fault, so just clear the modified bit.
7014 		 * pmap_page_protect has taken care of resetting
7015 		 * the state so that we'll see the next write as a fault to
7016 		 * the VM (i.e. we don't want a fast fault).
7017 		 */
7018 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7019 		return;
7020 	}
7021 	if (bits & PP_ATTR_REFERENCED) {
7022 		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7023 	}
7024 	if (bits & PP_ATTR_MODIFIED) {
7025 		allow_mode &= ~VM_PROT_WRITE;
7026 	}
7027 
7028 	if (bits == PP_ATTR_NOENCRYPT) {
7029 		/*
7030 		 * We short circuit this case; it should not need to
7031 		 * invoke arm_force_fast_fault, so just clear and
7032 		 * return.  On ARM, this bit is just a debugging aid.
7033 		 */
7034 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7035 		return;
7036 	}
7037 
7038 	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7039 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7040 	}
7041 }
7042 
/* Single-page entry point: clear attribute bits with no batched TLB range. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7052 
7053 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped in [start, end) within
 * a single twig (one leaf table's worth of VA space).
 *
 * Returns the VA at which processing stopped: 'end' on completion, or an
 * earlier address if pending preemption forced an early exit (the caller
 * resumes from the returned VA).
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig table: nothing mapped in this range. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Check for pending preemption after the first page. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only managed (pmap-tracked) pages carry cached attributes. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7097 
/*
 * Clear attribute bits for all managed pages mapped in [start, end),
 * processing one twig at a time and batching TLB maintenance into a single
 * ranged flush at the end.
 *
 * Returns the VA at which processing stopped (may be < end if preempted);
 * the caller loops until the whole range is covered.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates whether any mapping change requires a TLB flush. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Clamp this iteration to the end of the current twig. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig helper was preempted mid-way. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Perform the deferred ranged TLB invalidation, if anything changed. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true);
		sync_tlb_flush();
	}
	return va;
}
7146 
/*
 * Public range-clear driver: repeatedly invokes the (possibly preemptible)
 * internal helper until the entire [start, end) range has been processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The helper may stop early on preemption; resume where it left off. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7177 #endif /* __ARM_RANGE_TLBI__ */
7178 
/*
 * Single-page attribute clear: dispatches to the PPL or in-kernel
 * implementation depending on configuration.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7200 
7201 /*
7202  *	Set specified attribute bits.
7203  *
7204  *	Set cached value in the pv head because we have
7205  *	no per-mapping hardware support for referenced and
7206  *	modify bits.
7207  */
7208 MARK_AS_PMAP_TEXT void
7209 phys_attribute_set_internal(
7210 	ppnum_t pn,
7211 	unsigned int bits)
7212 {
7213 	pmap_paddr_t    pa = ptoa(pn);
7214 	assert(pn != vm_page_fictitious_addr);
7215 
7216 #if XNU_MONITOR
7217 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7218 		panic("%s: illegal request, "
7219 		    "pn=%u, bits=%#x",
7220 		    __FUNCTION__,
7221 		    pn, bits);
7222 	}
7223 #endif
7224 
7225 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7226 
7227 	return;
7228 }
7229 
/*
 * Set the specified phys_attribute bits for a single physical page,
 * dispatching to the PPL variant on monitor-enabled systems.
 */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7241 
7242 
7243 /*
7244  *	Check specified attribute bits.
7245  *
7246  *	use the software cached bits (since no hw support).
7247  */
7248 static boolean_t
7249 phys_attribute_test(
7250 	ppnum_t pn,
7251 	unsigned int bits)
7252 {
7253 	pmap_paddr_t    pa = ptoa(pn);
7254 	assert(pn != vm_page_fictitious_addr);
7255 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7256 }
7257 
7258 
7259 /*
7260  *	Set the modify/reference bits on the specified physical page.
7261  */
7262 void
7263 pmap_set_modify(ppnum_t pn)
7264 {
7265 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
7266 }
7267 
7268 
7269 /*
7270  *	Clear the modify bits on the specified physical page.
7271  */
7272 void
7273 pmap_clear_modify(
7274 	ppnum_t pn)
7275 {
7276 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7277 }
7278 
7279 
7280 /*
7281  *	pmap_is_modified:
7282  *
7283  *	Return whether or not the specified physical page is modified
7284  *	by any physical maps.
7285  */
7286 boolean_t
7287 pmap_is_modified(
7288 	ppnum_t pn)
7289 {
7290 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7291 }
7292 
7293 
7294 /*
7295  *	Set the reference bit on the specified physical page.
7296  */
7297 static void
7298 pmap_set_reference(
7299 	ppnum_t pn)
7300 {
7301 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
7302 }
7303 
7304 /*
7305  *	Clear the reference bits on the specified physical page.
7306  */
7307 void
7308 pmap_clear_reference(
7309 	ppnum_t pn)
7310 {
7311 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7312 }
7313 
7314 
7315 /*
7316  *	pmap_is_referenced:
7317  *
7318  *	Return whether or not the specified physical page is referenced
7319  *	by any physical maps.
7320  */
7321 boolean_t
7322 pmap_is_referenced(
7323 	ppnum_t pn)
7324 {
7325 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7326 }
7327 
7328 /*
7329  * pmap_get_refmod(phys)
7330  *  returns the referenced and modified bits of the specified
7331  *  physical page.
7332  */
7333 unsigned int
7334 pmap_get_refmod(
7335 	ppnum_t pn)
7336 {
7337 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7338 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7339 }
7340 
7341 static inline unsigned int
7342 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7343 {
7344 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7345 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7346 }
7347 
7348 /*
7349  * pmap_clear_refmod(phys, mask)
7350  *  clears the referenced and modified bits as specified by the mask
7351  *  of the specified physical page.
7352  */
7353 void
7354 pmap_clear_refmod_options(
7355 	ppnum_t         pn,
7356 	unsigned int    mask,
7357 	unsigned int    options,
7358 	void            *arg)
7359 {
7360 	unsigned int    bits;
7361 
7362 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7363 	phys_attribute_clear(pn, bits, options, arg);
7364 }
7365 
7366 /*
7367  * Perform pmap_clear_refmod_options on a virtual address range.
7368  * The operation will be performed in bulk & tlb flushes will be coalesced
7369  * if possible.
7370  *
7371  * Returns true if the operation is supported on this platform.
7372  * If this function returns false, the operation is not supported and
7373  * nothing has been modified in the pmap.
7374  */
7375 bool
7376 pmap_clear_refmod_range_options(
7377 	pmap_t pmap __unused,
7378 	vm_map_address_t start __unused,
7379 	vm_map_address_t end __unused,
7380 	unsigned int mask __unused,
7381 	unsigned int options __unused)
7382 {
7383 #if __ARM_RANGE_TLBI__
7384 	unsigned int    bits;
7385 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7386 	phys_attribute_clear_range(pmap, start, end, bits, options);
7387 	return true;
7388 #else /* __ARM_RANGE_TLBI__ */
7389 #pragma unused(pmap, start, end, mask, options)
7390 	/*
7391 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7392 	 * contiguous range of addresses. This is large performance improvement on
7393 	 * platforms that support ranged tlbi instructions. But on older platforms,
7394 	 * we can only flush per-page or the entire asid. So we currently
7395 	 * only support this operation on platforms that support ranged tlbi.
7396 	 * instructions. On other platforms, we require that
7397 	 * the VM modify the bits on a per-page basis.
7398 	 */
7399 	return false;
7400 #endif /* __ARM_RANGE_TLBI__ */
7401 }
7402 
/*
 * Clear the referenced/modified attributes selected by mask for the given
 * physical page, with default options and no per-call argument.
 */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7410 
7411 unsigned int
7412 pmap_disconnect_options(
7413 	ppnum_t pn,
7414 	unsigned int options,
7415 	void *arg)
7416 {
7417 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7418 		/*
7419 		 * On ARM, the "modified" bit is managed by software, so
7420 		 * we know up-front if the physical page is "modified",
7421 		 * without having to scan all the PTEs pointing to it.
7422 		 * The caller should have made the VM page "busy" so noone
7423 		 * should be able to establish any new mapping and "modify"
7424 		 * the page behind us.
7425 		 */
7426 		if (pmap_is_modified(pn)) {
7427 			/*
7428 			 * The page has been modified and will be sent to
7429 			 * the VM compressor.
7430 			 */
7431 			options |= PMAP_OPTIONS_COMPRESSOR;
7432 		} else {
7433 			/*
7434 			 * The page hasn't been modified and will be freed
7435 			 * instead of compressed.
7436 			 */
7437 		}
7438 	}
7439 
7440 	/* disconnect the page */
7441 	pmap_page_protect_options(pn, 0, options, arg);
7442 
7443 	/* return ref/chg status */
7444 	return pmap_get_refmod(pn);
7445 }
7446 
7447 /*
7448  *	Routine:
7449  *		pmap_disconnect
7450  *
7451  *	Function:
7452  *		Disconnect all mappings for this page and return reference and change status
7453  *		in generic format.
7454  *
7455  */
7456 unsigned int
7457 pmap_disconnect(
7458 	ppnum_t pn)
7459 {
7460 	pmap_page_protect(pn, 0);       /* disconnect the page */
7461 	return pmap_get_refmod(pn);   /* return ref/chg status */
7462 }
7463 
7464 boolean_t
7465 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7466 {
7467 	if (ptoa(first) >= vm_last_phys) {
7468 		return FALSE;
7469 	}
7470 	if (ptoa(last) < vm_first_phys) {
7471 		return FALSE;
7472 	}
7473 
7474 	return TRUE;
7475 }
7476 
7477 /*
7478  * The state maintained by the noencrypt functions is used as a
7479  * debugging aid on ARM.  This incurs some overhead on the part
7480  * of the caller.  A special case check in phys_attribute_clear
7481  * (the most expensive path) currently minimizes this overhead,
7482  * but stubbing these functions out on RELEASE kernels yields
7483  * further wins.
7484  */
7485 boolean_t
7486 pmap_is_noencrypt(
7487 	ppnum_t pn)
7488 {
7489 #if DEVELOPMENT || DEBUG
7490 	boolean_t result = FALSE;
7491 
7492 	if (!pa_valid(ptoa(pn))) {
7493 		return FALSE;
7494 	}
7495 
7496 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7497 
7498 	return result;
7499 #else
7500 #pragma unused(pn)
7501 	return FALSE;
7502 #endif
7503 }
7504 
/*
 * Set the noencrypt attribute on the specified physical page (debug-only
 * bookkeeping; a no-op on RELEASE kernels).
 */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no software attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7519 
/*
 * Clear the noencrypt attribute on the specified physical page (debug-only
 * bookkeeping; a no-op on RELEASE kernels).
 */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no software attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7534 
#if XNU_MONITOR
/*
 * Return whether the given physical page is owned by the PPL (monitor).
 * The page must be a managed page.
 */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7543 
/*
 * Lock the per-page PV head lock for the given physical page, or fall back
 * to the single global phys_backup_lock when the page is not managed (or on
 * PPL-enabled builds, where the PV locks live inside the monitor).
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: a single global lock covers all non-PV-locked pages. */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7560 
7561 
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV head lock
 * for a managed page, or the global phys_backup_lock otherwise.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback path: mirrors the global-lock case in pmap_lock_phys_page(). */
	{ simple_unlock(&phys_backup_lock);}
}
7578 
/*
 * Program the user translation table base and associated per-CPU state for
 * the given pmap.  For the kernel pmap, the user TTB is instead pointed at
 * the invalid translation table if it is not already clear.
 *
 * NOTE(review): assumes cpu_data_ptr describes the CPU this code is running
 * on (i.e. callers run with preemption disabled) -- confirm at call sites.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache the nested (shared-region) pmap state for this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the target pmap's page-size config differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the pmap's root table with its hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7607 
/*
 * Point the user TTB at the invalid translation table, so no user-space
 * translations can succeed until a real user TTB is installed.
 */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7613 
/*
 * Public entry point to clear the user TTB, dispatching to the PPL variant
 * on monitor-enabled systems.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7625 
7626 
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/*
 * Guard against a future PTE layout change silently aliasing the marker
 * with SW bits the fast-fault passes must preserve across pass 1 and 2.
 */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7640 
7641 
/*
 * Downgrade every mapping of the given physical page so that accesses not
 * permitted by allow_mode will take a fault, letting software ref/mod
 * tracking observe the next access.  Removing VM_PROT_READ clears the PTE's
 * access flag (AF); removing VM_PROT_WRITE demotes a writable mapping to
 * read-only and records that it "was writeable" so arm_clear_fast_fault()
 * can later restore write access.
 *
 * The page's PV list is walked twice: pass 1 rewrites PTEs and tags (with
 * ARM_PTE_FF_MARKER, where available) those needing TLB invalidation;
 * pass 2 issues the invalidations and clears the tags.
 *
 * @param ppnum the physical page to operate on
 * @param allow_mode the access modes that should remain fully functional
 * @param options PMAP_OPTIONS_* flags controlling reusable accounting,
 *        locking (PMAP_OPTIONS_FF_LOCKED), wired handling and flushing
 * @param flush_range if non-NULL, TLB invalidations for VAs covered by the
 *        range are deferred to the caller (ptfr_flush_needed is set instead)
 *
 * @return TRUE if all relevant mappings were processed; FALSE if the page
 *         is unmanaged or a wired mapping was skipped.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	/* Take the PV head lock unless the caller already holds it (PMAP_OPTIONS_FF_LOCKED). */
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left untouched unless PMAP_OPTIONS_FF_WIRED is set. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* VAs outside the caller's flush_range must be invalidated here in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 require invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record pending ref/mod fault state so arm_fast_fault can recognize it. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
7948 
7949 MARK_AS_PMAP_TEXT boolean_t
7950 arm_force_fast_fault_internal(
7951 	ppnum_t         ppnum,
7952 	vm_prot_t       allow_mode,
7953 	int             options)
7954 {
7955 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7956 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7957 	}
7958 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
7959 }
7960 
7961 /*
7962  *	Routine:	arm_force_fast_fault
7963  *
7964  *	Function:
7965  *		Force all mappings for this page to fault according
7966  *		to the access modes allowed, so we can gather ref/modify
7967  *		bits again.
7968  */
7969 
7970 boolean_t
7971 arm_force_fast_fault(
7972 	ppnum_t         ppnum,
7973 	vm_prot_t       allow_mode,
7974 	int             options,
7975 	__unused void   *arg)
7976 {
7977 	pmap_paddr_t    phys = ptoa(ppnum);
7978 
7979 	assert(ppnum != vm_page_fictitious_addr);
7980 
7981 	if (!pa_valid(phys)) {
7982 		return FALSE;   /* Not a managed page. */
7983 	}
7984 
7985 #if XNU_MONITOR
7986 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
7987 #else
7988 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
7989 #endif
7990 }
7991 
7992 /*
7993  *	Routine:	arm_clear_fast_fault
7994  *
7995  *	Function:
7996  *		Clear pending force fault for all mappings for this page based on
7997  *		the observed fault type, update ref/modify bits.
7998  */
7999 MARK_AS_PMAP_TEXT static boolean_t
8000 arm_clear_fast_fault(
8001 	ppnum_t ppnum,
8002 	vm_prot_t fault_type,
8003 	pt_entry_t *pte_p)
8004 {
8005 	pmap_paddr_t    pa = ptoa(ppnum);
8006 	pv_entry_t     *pve_p;
8007 	unsigned int    pai;
8008 	boolean_t       result;
8009 	bool            tlb_flush_needed = false;
8010 	pv_entry_t    **pv_h;
8011 	unsigned int    npve = 0;
8012 	unsigned int    pass1_updated = 0;
8013 	unsigned int    pass2_updated = 0;
8014 
8015 	assert(ppnum != vm_page_fictitious_addr);
8016 
8017 	if (!pa_valid(pa)) {
8018 		return FALSE;   /* Not a managed page. */
8019 	}
8020 
8021 	result = FALSE;
8022 	pai = pa_index(pa);
8023 	pvh_assert_locked(pai);
8024 	pv_h = pai_to_pvh(pai);
8025 
8026 	pve_p = PV_ENTRY_NULL;
8027 	if (pte_p == PT_ENTRY_NULL) {
8028 		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8029 			pte_p = pvh_ptep(pv_h);
8030 		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8031 			pve_p = pvh_pve_list(pv_h);
8032 		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8033 			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
8034 		}
8035 	}
8036 
8037 	pv_entry_t *orig_pve_p = pve_p;
8038 	pt_entry_t *orig_pte_p = pte_p;
8039 	int pve_ptep_idx = 0;
8040 
8041 	/*
8042 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8043 	 * TLB invalidation in pass 2.
8044 	 */
8045 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8046 		pt_entry_t spte;
8047 		pt_entry_t tmplate;
8048 
8049 		if (pve_p != PV_ENTRY_NULL) {
8050 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8051 			if (pte_p == PT_ENTRY_NULL) {
8052 				goto cff_skip_pve_pass1;
8053 			}
8054 		}
8055 
8056 #ifdef PVH_FLAG_IOMMU
8057 		if (pvh_ptep_is_iommu(pte_p)) {
8058 			goto cff_skip_pve_pass1;
8059 		}
8060 #endif
8061 		if (*pte_p == ARM_PTE_EMPTY) {
8062 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8063 		}
8064 
8065 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8066 		const pmap_t pmap = ptdp->pmap;
8067 		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8068 
8069 		assert(va >= pmap->min && va < pmap->max);
8070 
8071 		spte = *pte_p;
8072 		tmplate = spte;
8073 
8074 		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8075 			{
8076 				if (pmap == kernel_pmap) {
8077 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8078 				} else {
8079 					assert(pmap->type != PMAP_TYPE_NESTED);
8080 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8081 				}
8082 			}
8083 
8084 			tmplate |= ARM_PTE_AF;
8085 
8086 			pte_set_was_writeable(tmplate, false);
8087 			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8088 		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8089 			tmplate = spte | ARM_PTE_AF;
8090 
8091 			{
8092 				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
8093 			}
8094 		}
8095 
8096 #if MACH_ASSERT && XNU_MONITOR
8097 		if (is_pte_xprr_protected(pmap, spte)) {
8098 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8099 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8100 				    "ppnum=0x%x, fault_type=0x%x",
8101 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8102 				    ppnum, fault_type);
8103 			}
8104 		}
8105 #endif /* MACH_ASSERT && XNU_MONITOR */
8106 
8107 		assert(spte != ARM_PTE_TYPE_FAULT);
8108 		if (spte != tmplate) {
8109 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
8110 #ifdef ARM_PTE_FF_MARKER
8111 				assert(!(spte & ARM_PTE_FF_MARKER));
8112 				tmplate |= ARM_PTE_FF_MARKER;
8113 				++pass1_updated;
8114 #endif
8115 				tlb_flush_needed = true;
8116 			}
8117 			write_pte_fast(pte_p, tmplate);
8118 			result = TRUE;
8119 		}
8120 
8121 cff_skip_pve_pass1:
8122 		pte_p = PT_ENTRY_NULL;
8123 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8124 			pve_ptep_idx = 0;
8125 			pve_p = pve_next(pve_p);
8126 			++npve;
8127 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8128 				break;
8129 			}
8130 		}
8131 	}
8132 
8133 	if (!tlb_flush_needed) {
8134 		goto cff_finish;
8135 	}
8136 
8137 	FLUSH_PTE_STRONG();
8138 
8139 	/* Pass 2: Issue any required TLB invalidations */
8140 	pve_p = orig_pve_p;
8141 	pte_p = orig_pte_p;
8142 	pve_ptep_idx = 0;
8143 	npve = 0;
8144 
8145 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8146 		if (pve_p != PV_ENTRY_NULL) {
8147 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8148 			if (pte_p == PT_ENTRY_NULL) {
8149 				goto cff_skip_pve_pass2;
8150 			}
8151 		}
8152 
8153 #ifdef PVH_FLAG_IOMMU
8154 		if (pvh_ptep_is_iommu(pte_p)) {
8155 			goto cff_skip_pve_pass2;
8156 		}
8157 #endif
8158 
8159 #ifdef ARM_PTE_FF_MARKER
8160 		pt_entry_t spte = *pte_p;
8161 
8162 		if (!(spte & ARM_PTE_FF_MARKER)) {
8163 			goto cff_skip_pve_pass2;
8164 		} else {
8165 			spte &= (~ARM_PTE_FF_MARKER);
8166 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8167 			write_pte_fast(pte_p, spte);
8168 			++pass2_updated;
8169 		}
8170 #endif
8171 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8172 		const pmap_t pmap = ptdp->pmap;
8173 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8174 
8175 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
8176 
8177 cff_skip_pve_pass2:
8178 		pte_p = PT_ENTRY_NULL;
8179 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8180 			pve_ptep_idx = 0;
8181 			pve_p = pve_next(pve_p);
8182 			++npve;
8183 			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
8184 				break;
8185 			}
8186 		}
8187 	}
8188 
8189 cff_finish:
8190 	if (__improbable(pass1_updated != pass2_updated)) {
8191 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8192 		    __func__, pass1_updated, pass2_updated);
8193 	}
8194 	if (tlb_flush_needed) {
8195 		sync_tlb_flush();
8196 	}
8197 	return result;
8198 }
8199 
8200 /*
8201  * Determine if the fault was induced by software tracking of
8202  * modify/reference bits.  If so, re-enable the mapping (and set
8203  * the appropriate bits).
8204  *
8205  * Returns KERN_SUCCESS if the fault was induced and was
8206  * successfully handled.
8207  *
8208  * Returns KERN_FAILURE if the fault was not induced and
8209  * the function was unable to deal with it.
8210  *
8211  * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8212  * disallows this type of access.
8213  *
8214  * Returns KERN_ABORTED if the pmap lock is taken and a
8215  * preemption is pending.
8216  *
8217  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Bail out (the caller redrives on KERN_ABORTED) rather than block on the lock. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Snapshot the PTE and take the PVH lock for its physical page.
		 * Loop until the snapshot is still current once the lock is held.
		 */
		while (true) {
			/* Volatile read: the PTE may be concurrently updated by other cores. */
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				/* Not a managed page: no pv_head_table entry to consult. */
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Refuse to service faults against PPL-owned I/O memory. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed underneath us; drop the PVH lock and retry. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	/* Was this fault induced by software ref/mod tracking for this page? */
	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8357 
8358 kern_return_t
8359 arm_fast_fault(
8360 	pmap_t pmap,
8361 	vm_map_address_t va,
8362 	vm_prot_t fault_type,
8363 	bool was_af_fault,
8364 	__unused bool from_user)
8365 {
8366 	kern_return_t   result = KERN_FAILURE;
8367 
8368 	if (va < pmap->min || va >= pmap->max) {
8369 		return result;
8370 	}
8371 
8372 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8373 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8374 	    from_user);
8375 
8376 	do {
8377 #if XNU_MONITOR
8378 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8379 #else
8380 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8381 #endif
8382 	} while (result == KERN_ABORTED);
8383 
8384 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8385 
8386 	return result;
8387 }
8388 
8389 void
8390 pmap_copy_page(
8391 	ppnum_t psrc,
8392 	ppnum_t pdst)
8393 {
8394 	bcopy_phys((addr64_t) (ptoa(psrc)),
8395 	    (addr64_t) (ptoa(pdst)),
8396 	    PAGE_SIZE);
8397 }
8398 
8399 
8400 /*
8401  *	pmap_copy_page copies the specified (machine independent) pages.
8402  */
8403 void
8404 pmap_copy_part_page(
8405 	ppnum_t psrc,
8406 	vm_offset_t src_offset,
8407 	ppnum_t pdst,
8408 	vm_offset_t dst_offset,
8409 	vm_size_t len)
8410 {
8411 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8412 	    (addr64_t) (ptoa(pdst) + dst_offset),
8413 	    len);
8414 }
8415 
8416 
8417 /*
8418  *	pmap_zero_page zeros the specified (machine independent) page.
8419  */
8420 void
8421 pmap_zero_page(
8422 	ppnum_t pn)
8423 {
8424 	assert(pn != vm_page_fictitious_addr);
8425 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8426 }
8427 
8428 /*
8429  *	pmap_zero_part_page
8430  *	zeros the specified (machine independent) part of a page.
8431  */
8432 void
8433 pmap_zero_part_page(
8434 	ppnum_t pn,
8435 	vm_offset_t offset,
8436 	vm_size_t len)
8437 {
8438 	assert(pn != vm_page_fictitious_addr);
8439 	assert(offset + len <= PAGE_SIZE);
8440 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8441 }
8442 
/*
 * Establish the read-only kernel mapping of the low-globals page at its
 * fixed alias address (LOWGLOBAL_ALIAS).  The target PTE must be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Read-only (kernel), never executable, AF set so no access-flag fault is taken. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	/* Not-global, consistent with the kernel-protect ASID scheme. */
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE before invalidating the TLB for the alias range. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Let KASAN know this alias range is now valid to access. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8467 
8468 vm_offset_t
8469 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8470 {
8471 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8472 		panic("%s: invalid index %u", __func__, index);
8473 	}
8474 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8475 }
8476 
/*
 * Map physical page pn into a free per-CPU copy window on the current CPU
 * with the requested protection and cacheability, returning the window index.
 * Panics if all windows on this CPU are in use.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	/* Only consult I/O-range cache attributes for non-managed (I/O) pages. */
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Scan this CPU's windows for one whose PTE is currently empty. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Kernel-only mapping, never executable; AF set to avoid access-flag faults. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}

	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the strong-sync setting left by the window's previous occupant for this flush. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8549 
/*
 * Map a physical page into a per-CPU copy window; dispatches to the PPL
 * on XNU_MONITOR configurations.  Returns the window index in use.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8562 
/*
 * Tear down the mapping in per-CPU copy window `index` on the current CPU,
 * making the window available for reuse.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE, then invalidate the TLB for the window's single page. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8583 
/*
 * Unmap an in-use per-CPU copy window; dispatches to the PPL on
 * XNU_MONITOR configurations.
 *
 * index: window index previously returned by pmap_map_cpu_windows_copy().
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
	/*
	 * ISO C forbids `return expr;` in a function returning void
	 * (C11 6.8.6.4); invoke the implementation as a plain statement.
	 */
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8594 
8595 #if XNU_MONITOR
8596 
8597 MARK_AS_PMAP_TEXT void
8598 pmap_invoke_with_page(
8599 	ppnum_t page_number,
8600 	void *ctx,
8601 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8602 {
8603 	#pragma unused(page_number, ctx, callback)
8604 }
8605 
8606 /*
8607  * Loop over every pmap_io_range (I/O ranges marked as owned by
8608  * the PPL in the device tree) and conditionally call callback() on each range
8609  * that needs to be included in the hibernation image.
8610  *
8611  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8612  *                 context is needed in the callback.
8613  * @param callback Callback function invoked on each range (gated by flag).
8614  */
8615 MARK_AS_PMAP_TEXT void
8616 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8617 {
8618 	extern const pmap_io_range_t* io_attr_table;
8619 	extern const unsigned int num_io_rgns;
8620 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8621 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8622 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8623 		}
8624 	}
8625 }
8626 
8627 /**
8628  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8629  * PPL-owned page. Otherwise, do nothing.
8630  *
8631  * @param addr Physical address of the page to set the HASHED flag on.
8632  */
8633 MARK_AS_PMAP_TEXT void
8634 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8635 {
8636 	/* Ignore non-managed kernel memory. */
8637 	if (!pa_valid(addr)) {
8638 		return;
8639 	}
8640 
8641 	const unsigned int pai = pa_index(addr);
8642 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8643 		pv_entry_t **pv_h = pai_to_pvh(pai);
8644 
8645 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8646 		pvh_lock(pai);
8647 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8648 		pvh_unlock(pai);
8649 	}
8650 }
8651 
8652 /**
8653  * Loop through every physical page in the system and clear out the HASHED flag
8654  * on every PPL-owned page. That flag is used to keep track of which pages have
8655  * been hashed into the hibernation image during the hibernation entry process.
8656  *
8657  * The HASHED flag needs to be cleared out between hibernation cycles because the
8658  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8659  * image with the HASHED flag set on certain pages. It's important to clear the
8660  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8661  * into the hibernation image can't be compromised across hibernation cycles.
8662  */
8663 MARK_AS_PMAP_TEXT void
8664 pmap_clear_ppl_hashed_flag_all(void)
8665 {
8666 	const unsigned int last_index = pa_index(vm_last_phys);
8667 	pv_entry_t **pv_h = NULL;
8668 
8669 	for (int pai = 0; pai < last_index; ++pai) {
8670 		pv_h = pai_to_pvh(pai);
8671 
8672 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8673 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8674 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8675 			pvh_lock(pai);
8676 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8677 			pvh_unlock(pai);
8678 		}
8679 	}
8680 }
8681 
8682 /**
8683  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8684  * ppl_hib driver will call this after all wired pages have been copied into the
8685  * hibernation image.
8686  */
8687 MARK_AS_PMAP_TEXT void
8688 pmap_check_ppl_hashed_flag_all(void)
8689 {
8690 	const unsigned int last_index = pa_index(vm_last_phys);
8691 	pv_entry_t **pv_h = NULL;
8692 
8693 	for (int pai = 0; pai < last_index; ++pai) {
8694 		pv_h = pai_to_pvh(pai);
8695 
8696 		/**
8697 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8698 		 * the pages that contain the PMAP stacks.
8699 		 */
8700 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8701 		    (pai < pa_index(pmap_stacks_end_pa));
8702 
8703 		if (!is_pmap_stack &&
8704 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8705 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8706 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8707 		}
8708 	}
8709 }
8710 
8711 #endif /* XNU_MONITOR */
8712 
8713 /*
8714  * Indicate that a pmap is intended to be used as a nested pmap
8715  * within one or more larger address spaces.  This must be set
8716  * before pmap_nest() is called with this pmap as the 'subordinate'.
8717  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Only ordinary user pmaps may be converted into nested pmaps. */
	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}
	pmap->type = PMAP_TYPE_NESTED;
	/* Release the pmap's ID via the per-pmap-type ops table. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8730 
/*
 * Mark a pmap as nested; dispatches to the PPL on XNU_MONITOR configurations.
 * Must be called before pmap_nest() uses this pmap as the subordinate.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8741 
8742 /*
8743  * pmap_trim_range(pmap, start, end)
8744  *
8745  * pmap  = pmap to operate on
8746  * start = start of the range
8747  * end   = end of the range
8748  *
8749  * Attempts to deallocate TTEs for the given range in the nested range.
8750  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The trimmed range must lie entirely within this pmap's nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	/* The (cur >= adjusted_start) check also stops the loop if cur wraps around. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	/* Re-contract the range, this time to L1 boundaries. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table this L1 entry points to. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
8866 
8867 /**
8868  * State machine for multi-step pmap trimming. Trimming is the action of
8869  * deallocating the TTEs of the shared region of pmaps down to a given range.
8870  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
8871  * disabling preemption for too long. These steps include computing the bounds
8872  * of the shared region, trimming the head of the "grand", trimming the tail of
8873  * the "grand", and trimming the "subord". Some of the steps can be skipped under
8874  * different conditions.
8875  *
8876  * @param grand the pmap in which the pages are nested
8877  * @param subord the pmap from which the pages are shared, or nested
8878  * @param vstart start of the used range in "grand"
8879  * @param size size of the used range
8880  * @param state the current state of the state machine
8881  *
8882  * @return the next state of the state machine, to be used in the next call
8883  *         into this function.
8884  */
8885 MARK_AS_PMAP_TEXT pmap_trim_state_t
8886 pmap_trim_internal(
8887 	pmap_t grand,
8888 	pmap_t subord,
8889 	addr64_t vstart,
8890 	uint64_t size,
8891 	pmap_trim_state_t state)
8892 {
8893 	/* Validation needs to be done regardless of state. */
8894 	addr64_t vend;
8895 
8896 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
8897 		panic("%s: grand addr wraps around, "
8898 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8899 		    __func__, grand, subord, (void*)vstart, size, state);
8900 	}
8901 
8902 	validate_pmap_mutable(grand);
8903 	validate_pmap(subord);
8904 
8905 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8906 		panic("%s: subord is of non-nestable type 0x%hhx, "
8907 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8908 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
8909 	}
8910 
8911 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
8912 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
8913 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8914 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
8915 	}
8916 
8917 	if (__improbable(grand->nested_pmap != subord)) {
8918 		panic("%s: grand->nested != subord, "
8919 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8920 		    __func__, grand, subord, (void*)vstart, size, state);
8921 	}
8922 
8923 	if (__improbable((size != 0) &&
8924 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
8925 		panic("%s: grand range not in nested region, "
8926 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8927 		    __func__, grand, subord, (void*)vstart, size, state);
8928 	}
8929 
8930 	/* Trimming starts with figuring out the bounds for the grand. */
8931 	if (state == PMAP_TRIM_STATE_START) {
8932 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8933 
8934 		/**
8935 		 * The "nested_has_no_bounds_ref" flag is set by `pmap_nest()` if the subord is nested into
8936 		 * the grand when the bounds are not known yet. Therefore, if it is not set, either any nesting
8937 		 * has not happened, or trimming has been done, or nesting has been done with bounds known so
8938 		 * the "extra" region was not nested in the first place. Anyway, trimming is not needed so
8939 		 * we exit early with PMAP_TRIM_STATE_DONE.
8940 		 */
8941 		if (!grand->nested_has_no_bounds_ref) {
8942 			assert(subord->nested_bounds_set);
8943 
8944 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
8945 			if (!grand->nested_bounds_set) {
8946 				/* Inherit the bounds from subord. */
8947 				grand->nested_region_true_start = subord->nested_region_true_start;
8948 				grand->nested_region_true_end = subord->nested_region_true_end;
8949 				grand->nested_bounds_set = true;
8950 			}
8951 
8952 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8953 
8954 			/* Now that the grand has bounds, we are done. */
8955 			return PMAP_TRIM_STATE_DONE;
8956 		}
8957 
8958 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
8959 		if ((!subord->nested_bounds_set) && size) {
8960 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8961 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8962 
8963 			subord->nested_region_true_start = vstart;
8964 			subord->nested_region_true_end = vend;
8965 			subord->nested_region_true_start &= ~adjust_offmask;
8966 
8967 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
8968 				panic("%s: padded true end wraps around, "
8969 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
8970 				    __func__, grand, subord, (void*)vstart, size, state);
8971 			}
8972 
8973 			subord->nested_region_true_end &= ~adjust_offmask;
8974 			subord->nested_bounds_set = true;
8975 		}
8976 
8977 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
8978 		if (subord->nested_bounds_set) {
8979 			/* Inherit the bounds from subord. */
8980 			grand->nested_region_true_start = subord->nested_region_true_start;
8981 			grand->nested_region_true_end = subord->nested_region_true_end;
8982 			grand->nested_bounds_set = true;
8983 
8984 			/* If we know the bounds, we can trim the pmap. */
8985 			grand->nested_has_no_bounds_ref = false;
8986 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8987 
8988 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
8989 		} else {
8990 			/* Don't trim if we don't know the bounds. */
8991 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8992 
8993 			return PMAP_TRIM_STATE_DONE;
8994 		}
8995 	}
8996 
8997 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
8998 	if (!grand->nested_bounds_set) {
8999 		panic("%s: !grand->nested_bounds_set, "
9000 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9001 		    __func__, grand, subord, (void*)vstart, size, state);
9002 	}
9003 
9004 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9005 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9006 
9007 #if XNU_MONITOR
9008 		if (pmap_pending_preemption()) {
9009 			return PMAP_TRIM_STATE_GRAND_AFTER;
9010 		}
9011 #endif
9012 
9013 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9014 	}
9015 
9016 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9017 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9018 
9019 #if XNU_MONITOR
9020 		if (pmap_pending_preemption()) {
9021 			return PMAP_TRIM_STATE_SUBORD;
9022 		}
9023 #endif
9024 
9025 		state = PMAP_TRIM_STATE_SUBORD;
9026 	}
9027 
9028 	/* START state is guaranteed to compute the bounds for the subord. */
9029 	if (!subord->nested_bounds_set) {
9030 		panic("%s: !subord->nested_bounds_set, "
9031 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9032 		    __func__, grand, subord, (void*)vstart, size, state);
9033 	}
9034 
9035 	if (state == PMAP_TRIM_STATE_SUBORD) {
9036 		pmap_trim_subord(subord);
9037 	}
9038 
9039 	return PMAP_TRIM_STATE_DONE;
9040 }
9041 
/*
 * Drop this pmap's no-bounds reference on its nested pmap (if held) and trim
 * this pmap's view of the nested region down to the now-known true bounds.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		/* Snapshot the nested pmap's bounds under its lock, then trim unlocked. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_has_no_bounds_ref = false;
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim everything outside [true_start, true_end). */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9065 
9066 /*
9067  * pmap_trim_subord(grand, subord)
9068  *
9069  * grand  = pmap that we have nested subord in
9070  * subord = nested pmap we are attempting to trim
9071  *
9072  * Trims subord if possible
9073  */
MARK_AS_PMAP_TEXT static void
pmap_trim_subord(pmap_t subord)
{
	bool contract_subord = false;

	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

	/* Drop one no-bounds reference held against this nested pmap. */
	subord->nested_no_bounds_refcnt--;

	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
		/* If this was the last no bounds reference, trim subord. */
		contract_subord = true;
	}

	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	/* Trim outside the lock; pmap_trim_range() takes the lock itself. */
	if (contract_subord) {
		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
	}
}
9095 
9096 /**
9097  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9098  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9099  * disabling preemption for too long.
9100  *
9101  * @note When we load the shared region we always create pages tables for the
9102  *       entire region. In practice, the shared cache may use just a portion
9103  *       of that. Before we know the bounds of the shared region, it can
9104  *       already be mapped into processes. Therefore, once the bounds are
9105  *       known, "trimming" comes in handy to remove the unnecessary page
9106  *       tables in the processes the shared region is mapped in, and eventually
9107  *       those in the shared region itself. Note that the shared region must
9108  *       be trimmed after the user processes because it has the L3 entries
9109  *       everyone else is pointing to.
9110  *
9111  * @param grand the pmap in which the pages are nested
9112  * @param subord the pmap from which the pages are shared, or nested
9113  * @param vstart start of the used range in "grand"
9114  * @param size size of the used range
9115  */
void
pmap_trim(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
	/* On PPL systems, drives the state machine until its done. */
	while (state != PMAP_TRIM_STATE_DONE) {
		__assert_only pmap_trim_state_t old_state = state;
		state = pmap_trim_ppl(grand, subord, vstart, size, state);

		/* Are we making progress? */
		assert(old_state != state);
	}

	/* Verify that trimming did not unbalance either pmap's ledger. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	state = pmap_trim_internal(grand, subord, vstart, size, state);

	/* On non-PPL systems, we expect the implementation to finish in one call. */
	assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9144 
9145 #if HAS_APPLE_PAC
/**
 * Sign a user-space pointer with a process-independent PAC key under the
 * user JOP key supplied by the caller.
 *
 * @param value the raw pointer to sign
 * @param key PAC key to use; must be ptrauth_key_asia or ptrauth_key_asda
 * @param discriminator PAC discriminator (salt) for the signature
 * @param jop_key the user JOP key to install while signing
 *
 * @return the signed pointer
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent A-family keys may be used here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Disable interrupts around the key swap — presumably so nothing else
	 * can run (and sign/auth with the wrong key) while the user JOP key
	 * is installed.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/*
	 * Force 'value' to be materialized here so the compiler cannot hoist
	 * the signing computation outside the window where the user key is live.
	 */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	/* Restore the previous JOP key state before re-enabling interrupts. */
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9177 
/* Public entry point: sign a user pointer (see pmap_sign_user_ptr_internal). */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9183 
/**
 * Authenticate (strip and verify) a user-space pointer signed with a
 * process-independent PAC key, under the user JOP key supplied by the caller.
 *
 * @param value the signed pointer to authenticate
 * @param key PAC key to use; must be ptrauth_key_asia or ptrauth_key_asda
 * @param discriminator PAC discriminator (salt) used at signing time
 * @param jop_key the user JOP key to install while authenticating
 *
 * @return the authenticated pointer
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only the process-independent A-family keys may be used here. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/*
	 * Disable interrupts around the key swap — presumably so nothing else
	 * can run while the user JOP key is installed.
	 */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the auth computation pinned inside the key-swap window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9204 
/* Public entry point: authenticate a user pointer (see pmap_auth_user_ptr_internal). */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9210 #endif /* HAS_APPLE_PAC */
9211 
9212 /*
9213  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9214  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9215  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9216  * return value, to indicate where a preempted [un]nest operation should resume.
9217  * When the return value contains the ending address of the nested region with
9218  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9219  */
9220 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9221 
9222 /*
9223  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9224  *
9225  *	grand  = the pmap that we will nest subord into
9226  *	subord = the pmap that goes into the grand
9227  *	vstart  = start of range in pmap to be inserted
9228  *	size   = Size of nest area (up to 16TB)
9229  *
9230  *	Inserts a pmap into another.  This is used to implement shared segments.
9231  *
9232  */
9233 
9234 /**
9235  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9236  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9237  * This function operates in 3 main phases:
9238  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9239  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9240  *    the mapping range are present in subord.
9241  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9242  *    contains pointers to subord's leaf-level pagetable pages for the specified
9243  *    VA range.
9244  *
9245  * This function may return early due to pending AST_URGENT preemption; if so
9246  * it will indicate the need to be re-entered.
9247  *
9248  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9249  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9250  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9251  * @param size twig-aligned size of the nesting range
9252  * @param vrestart the twig-aligned starting address of the current call.  May contain
9253  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9254  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9255  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9256  *
9257  * @return the virtual address at which to restart the operation, possibly including
9258  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9259  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9260  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	unsigned int    nested_region_asid_bitmap_size;
	unsigned int*   nested_region_asid_bitmap;
	int             expand_options = 0;
	bool            deref_subord = true;

	/* Reject a nesting range whose end wraps the 64-bit address space. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (with the phase bit masked off) must lie within the range. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);
#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Hold a reference on subord for the duration of this call. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	/* Both pmaps must share the same page-table geometry. */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

#if XNU_MONITOR
	/* The PPL cannot block on allocation; the caller retries on KERN_RESOURCE_SHORTAGE. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Base, size, and restart cursor must all be twig (leaf-table) aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	if (__improbable(grand->type != PMAP_TYPE_USER)) {
		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
	}

	/*
	 * Phase 1 (bookkeeping): lazily allocate subord's per-twig "unnested" ASID
	 * bitmap.  The buffer is allocated without the lock held and published
	 * under the lock; whichever thread loses the race frees its copy below.
	 */
	if (subord->nested_region_asid_bitmap == NULL) {
		/* Size is in units of unsigned int: one bit per twig entry in the region. */
		nested_region_asid_bitmap_size  = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_asid_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_asid_bitmap = kalloc_data(
			nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		if (subord->nested_region_asid_bitmap == NULL) {
			subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Ensure that the rest of the subord->nested_region_* fields are
			 * initialized and visible before setting the nested_region_asid_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			__builtin_arm_dmb(DMB_ISHST);
			subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
			nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* We lost the publication race; free our unused copy. */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
	}

	/**
	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
	 * speculated before their initialization.
	 */
	__builtin_arm_dmb(DMB_ISHLD);

	/*
	 * The requested range extends past subord's current nested region:
	 * grow the region and re-allocate a larger ASID bitmap, again publishing
	 * under the lock and freeing the losing copy afterward.
	 */
	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
		uint64_t        new_size;
		unsigned int    new_nested_region_asid_bitmap_size;
		unsigned int*   new_nested_region_asid_bitmap;

		nested_region_asid_bitmap = NULL;
		nested_region_asid_bitmap_size = 0;
		new_size =  vend - subord->nested_region_addr;

		/* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
		new_nested_region_asid_bitmap_size  = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
			    __FUNCTION__, new_nested_region_asid_bitmap_size,
			    grand, subord, vstart, new_size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto nest_cleanup;
		}

		assert(pa);

		new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
#else
		new_nested_region_asid_bitmap = kalloc_data(
			new_nested_region_asid_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif
		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		if (subord->nested_region_size < new_size) {
			/*
			 * NOTE(review): nested_region_asid_bitmap_size appears to be in
			 * units of unsigned int (see how it is computed above), while
			 * bcopy takes a byte count — confirm whether this length should
			 * be multiplied by sizeof(unsigned int).
			 */
			bcopy(subord->nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
			nested_region_asid_bitmap_size  = subord->nested_region_asid_bitmap_size;
			nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
			subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
			subord->nested_region_size = new_size;
			new_nested_region_asid_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
		/* Free the old bitmap (if we swapped) ... */
		if (nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(nested_region_asid_bitmap,
			    nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
		/* ... or the new one if another thread grew the region first. */
		if (new_nested_region_asid_bitmap != NULL) {
#if XNU_MONITOR
			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
#else
			kfree_data(new_nested_region_asid_bitmap,
			    new_nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
		}
	}

	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);

	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		deref_subord = false;

		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			grand->nested_has_no_bounds_ref = true;
			subord->nested_no_bounds_refcnt++;
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* grand may nest at most one subordinate pmap, inside one region. */
		if (__improbable(grand->nested_pmap != subord)) {
			panic("pmap_nest() pmap %p has a nested pmap", grand);
		} else if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor to the region subord actually uses. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* Phase bit set: phase 2 (expansion of subord) is already complete. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/*
	 * Phase 2: make sure subord has a leaf-level page table for every twig
	 * entry in the range, dropping the lock (and possibly returning early on
	 * pending preemption) as needed.
	 */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Mark the transition to phase 3 in the restart cursor. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	/* Phase 3 operates on grand; subord's tables are fully expanded by now. */
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/* Share subord's leaf table by copying its twig entry into grand. */
		*gtte_p = *stte_p;

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Covered the whole (clamped) range: report full completion to the caller. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible to the table walker before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
#if XNU_MONITOR
nest_cleanup:
	if (kr != KERN_SUCCESS) {
		/* krp lives in kernel memory; pin it while the PPL writes through it. */
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	if (deref_subord) {
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}
	return vrestart;
}
9599 
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	/* Completion is signaled by a restart cursor of (vend | PMAP_NEST_GRAND). */
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	/* Drive the PPL implementation to completion, resuming where it left off. */
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/* Reset kr to KERN_SUCCESS and try again. */
			kr = KERN_SUCCESS;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9652 
9653 /*
9654  *	kern_return_t pmap_unnest(grand, vaddr)
9655  *
9656  *	grand  = the pmap that will have the virtual range unnested
9657  *	vaddr  = start of range in pmap to be unnested
9658  *	size   = size of range in pmap to be unnested
9659  *
9660  */
9661 
kern_return_t
pmap_unnest(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size)
{
	/* Convenience wrapper: unnest with no extra option flags. */
	return pmap_unnest_options(grand, vaddr, size, 0);
}
9670 
9671 /**
9672  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9673  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9674  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9675  * still have the region nested.  The mappings in 'grand' will be left empty
9676  * with the assumption that they will be demand-filled by subsequent access faults.
9677  *
9678  * This function operates in 2 main phases:
9679  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9680  *    them non-global.
9681  * 2. Clearing of the twig-level TTEs for the address range in grand.
9682  *
9683  * This function may return early due to pending AST_URGENT preemption; if so
9684  * it will indicate the need to be re-entered.
9685  *
9686  * @param grand pmap from which to unnest mappings
9687  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9688  * @param size twig-aligned size of the nested range
9689  * @param vrestart the page-aligned starting address of the current call.  May contain
9690  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9691  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9692  *        grand is being torn down and step 1) above is not needed.
9693  *
9694  * @return the virtual address at which to restart the operation, possibly including
9695  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9696  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9697  */
9698 MARK_AS_PMAP_TEXT vm_map_offset_t
9699 pmap_unnest_options_internal(
9700 	pmap_t grand,
9701 	addr64_t vaddr,
9702 	uint64_t size,
9703 	vm_map_offset_t vrestart,
9704 	unsigned int option)
9705 {
9706 	vm_map_offset_t start;
9707 	vm_map_offset_t addr;
9708 	tt_entry_t     *tte_p;
9709 	unsigned int    current_index;
9710 	unsigned int    start_index;
9711 	unsigned int    max_index;
9712 	unsigned int    entry_count = 0;
9713 
9714 	addr64_t vend;
9715 	addr64_t true_end;
9716 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9717 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9718 	}
9719 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9720 	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9721 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9722 		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9723 	}
9724 
9725 	validate_pmap_mutable(grand);
9726 
9727 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9728 
9729 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9730 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9731 		    (unsigned long long)vaddr, (unsigned long long)size);
9732 	}
9733 
9734 	if (__improbable(grand->nested_pmap == NULL)) {
9735 		panic("%s: %p has no nested pmap", __func__, grand);
9736 	}
9737 
9738 	true_end = vend;
9739 	if (true_end > grand->nested_pmap->nested_region_true_end) {
9740 		true_end = grand->nested_pmap->nested_region_true_end;
9741 	}
9742 
9743 	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
9744 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9745 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9746 		}
9747 
9748 		pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9749 
9750 		start = vrestart;
9751 		if (start < grand->nested_pmap->nested_region_true_start) {
9752 			start = grand->nested_pmap->nested_region_true_start;
9753 		}
9754 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9755 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9756 		bool flush_tlb = false;
9757 
9758 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9759 			pt_entry_t  *bpte, *cpte;
9760 
9761 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9762 
9763 			bpte = pmap_pte(grand->nested_pmap, addr);
9764 
9765 			/*
9766 			 * If we've re-entered this function partway through unnesting a leaf region, the
9767 			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
9768 			 * the run of PTEs.  We therefore also need to check for a non-twig-aligned starting
9769 			 * address.
9770 			 */
9771 			if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
9772 			    (addr & pt_attr_twig_offmask(pt_attr))) {
9773 				/*
9774 				 * Mark the 'twig' region as being unnested.  Every mapping entered within
9775 				 * the nested pmap in this region will now be marked non-global.  Do this
9776 				 * before marking any of the PTEs within the region as non-global to avoid
9777 				 * the possibility of pmap_enter() subsequently inserting a global mapping
9778 				 * in the region, which could lead to a TLB conflict if a non-global entry
9779 				 * is later inserted for the same VA in a pmap which has fully unnested this
9780 				 * region.
9781 				 */
9782 				setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
9783 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
9784 					pmap_paddr_t    pa;
9785 					unsigned int    pai = 0;
9786 					boolean_t               managed = FALSE;
9787 					pt_entry_t  spte;
9788 
9789 					if ((*cpte != ARM_PTE_TYPE_FAULT)
9790 					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
9791 						spte = *((volatile pt_entry_t*)cpte);
9792 						while (!managed) {
9793 							pa = pte_to_pa(spte);
9794 							if (!pa_valid(pa)) {
9795 								break;
9796 							}
9797 							pai = pa_index(pa);
9798 							pvh_lock(pai);
9799 							spte = *((volatile pt_entry_t*)cpte);
9800 							pa = pte_to_pa(spte);
9801 							if (pai == pa_index(pa)) {
9802 								managed = TRUE;
9803 								break; // Leave the PVH locked as we'll unlock it after we update the PTE
9804 							}
9805 							pvh_unlock(pai);
9806 						}
9807 
9808 						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
9809 							write_pte_fast(cpte, (spte | ARM_PTE_NG));
9810 							flush_tlb = true;
9811 						}
9812 
9813 						if (managed) {
9814 							pvh_assert_locked(pai);
9815 							pvh_unlock(pai);
9816 						}
9817 					}
9818 
9819 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9820 					vrestart = addr;
9821 					++entry_count;
9822 					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9823 					    pmap_pending_preemption())) {
9824 						goto unnest_subord_done;
9825 					}
9826 				}
9827 			}
9828 			addr = vlim;
9829 			vrestart = addr;
9830 			++entry_count;
9831 			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9832 			    pmap_pending_preemption())) {
9833 				break;
9834 			}
9835 		}
9836 
9837 unnest_subord_done:
9838 		if (flush_tlb) {
9839 			FLUSH_PTE_STRONG();
9840 			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
9841 		}
9842 
9843 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9844 		if (current_index < max_index) {
9845 			return vrestart;
9846 		}
9847 	}
9848 
9849 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9850 
9851 	/*
9852 	 * invalidate all pdes for segment at vaddr in pmap grand
9853 	 */
9854 	if (vrestart & PMAP_NEST_GRAND) {
9855 		addr = vrestart & ~PMAP_NEST_GRAND;
9856 		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
9857 			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
9858 		}
9859 	} else {
9860 		addr = vaddr;
9861 		vrestart = vaddr | PMAP_NEST_GRAND;
9862 	}
9863 
9864 	if (addr < grand->nested_pmap->nested_region_true_start) {
9865 		addr = grand->nested_pmap->nested_region_true_start;
9866 	}
9867 
9868 	while (addr < true_end) {
9869 		tte_p = pmap_tte(grand, addr);
9870 		/*
9871 		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
9872 		 * so it's possible that a region we're trying to unnest may not have been
9873 		 * nested in the first place.
9874 		 */
9875 		if (tte_p != NULL) {
9876 			*tte_p = ARM_TTE_TYPE_FAULT;
9877 		}
9878 		addr += pt_attr_twig_size(pt_attr);
9879 		vrestart = addr | PMAP_NEST_GRAND;
9880 		++entry_count;
9881 		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9882 		    pmap_pending_preemption())) {
9883 			break;
9884 		}
9885 	}
9886 	if (addr >= true_end) {
9887 		vrestart = vend | PMAP_NEST_GRAND;
9888 	}
9889 
9890 	FLUSH_PTE_STRONG();
9891 	PMAP_UPDATE_TLBS(grand, start, addr, false, false);
9892 
9893 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9894 
9895 	return vrestart;
9896 }
9897 
kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	unsigned int option)
{
	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
	/* Completion is signaled by a restart cursor of (vend | PMAP_NEST_GRAND). */
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vrestart;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	pmap_verify_preemptible();
	/* Re-enter the implementation until it reports completion. */
	while (vrestart != (vend | PMAP_NEST_GRAND)) {
#if XNU_MONITOR
		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
		/* Each PPL call must advance the cursor. */
		if (vrestart == vlast) {
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vaddr, (unsigned long long)vend, (unsigned long long)vrestart);
		}
		vlast = vrestart;
#else
		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
9930 
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	/* No adjustment is needed on this architecture; TRUE lets the caller proceed. */
	return TRUE; /* to get to log_unnest_badness()... */
}
9939 
9940 #if PMAP_FORK_NEST
9941 /**
9942  * Perform any necessary pre-nesting of the parent's shared region at fork()
9943  * time.
9944  *
9945  * @note This should only be called from vm_map_fork().
9946  *
9947  * @param old_pmap The pmap of the parent task.
9948  * @param new_pmap The pmap of the child task.
9949  * @param nesting_start An output parameter that is updated with the start
9950  *                      address of the range that was pre-nested
9951  * @param nesting_end An output parameter that is updated with the end
9952  *                      address of the range that was pre-nested
9953  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
9955  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
9956  */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Nothing to pre-nest if the parent has no shared region nested. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}
	/*
	 * Nest the parent's shared-region pmap into the child at the same
	 * address and size.  NOTE(review): the return value of pmap_nest()
	 * is ignored here; the assertf below only checks the resulting fields.
	 */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* Report the pre-nested range back to vm_map_fork(). */
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
9988 #endif /* PMAP_FORK_NEST */
9989 
/*
 * disable no-execute capability on
 * the specified pmap
 */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Clearing nx_enabled turns off no-execute enforcement for this pmap. */
	pmap->nx_enabled = FALSE;
}
#else
/* On release kernels this is a deliberate no-op; NX cannot be disabled. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10008 
/*
 * flush a range of hardware TLB entries.
 * NOTE: assumes the smallest TLB entry in use will be for
 * an ARM small page (4K).
 */

/* Above this many pages, a full TLB (or full-ASID) flush is used instead of per-page invalidates. */
#define ARM_FULL_TLB_FLUSH_THRESHOLD 64

#if __ARM_RANGE_TLBI__
/* With hardware range-TLBI support, use a ranged invalidate for anything larger than one page. */
#define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
/* ARM64_TLB_RANGE_PAGES — presumably the maximum page count a single ranged TLBI can cover. */
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_PAGES
#else
#define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
#endif // __ARM_RANGE_TLBI__
10023 
/**
 * Issue (but do not synchronize) TLB invalidates for a VA range in a pmap.
 *
 * @param va Starting virtual address of the range to invalidate.
 * @param length Length of the range in bytes.
 * @param pmap The pmap whose mappings are to be invalidated.
 * @param last_level_only Passed through to the invalidate primitives; when
 *        true, only last-level (leaf) entries are invalidated.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
	uint32_t    asid;

	asid = pmap->hw_asid;

	/* For large ranges, flushing everything is cheaper than per-page invalidates. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/* A zero ASID or a nested pmap cannot be targeted by a single-ASID flush here. */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Use the hardware range-invalidate encoding when the range qualifies. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only);
		}
		return;
	}
#endif
	/* Per-page invalidates: fold the ASID into the start/end TLBI operands. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only);
	}
}
10071 
/* Issue (without synchronizing) an invalidate of all TLB entries tagged with this pmap's ASID. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT);
}
10077 
/*
 * Flush the kernel pmap's TLB entries for [va, va + length) and wait for the
 * invalidates to complete.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
	sync_tlb_flush();
}
10086 
10087 unsigned int
10088 pmap_cache_attributes(
10089 	ppnum_t pn)
10090 {
10091 	pmap_paddr_t    paddr;
10092 	unsigned int    pai;
10093 	unsigned int    result;
10094 	pp_attr_t       pp_attr_current;
10095 
10096 	paddr = ptoa(pn);
10097 
10098 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10099 
10100 	if (!pa_valid(paddr)) {
10101 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10102 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10103 	}
10104 
10105 	result = VM_WIMG_DEFAULT;
10106 
10107 	pai = pa_index(paddr);
10108 
10109 	pp_attr_current = pp_attr_table[pai];
10110 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10111 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10112 	}
10113 	return result;
10114 }
10115 
/*
 * Perform any cache maintenance required after a page's WIMG attributes change
 * from wimg_bits_prev to wimg_bits_new.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * NOTE(review): the innermost disjunction,
	 * ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK)),
	 * is always true (a value cannot equal both), so any change away from
	 * VM_WIMG_WTHRU triggers a sync.  '&&' may have been intended; as written
	 * the behavior is merely conservative (an extra sync) — confirm before changing.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Pages newly transitioning into VM_WIMG_RT get a forced dcache clean. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10132 
/*
 * Update the cache attributes of a compressor page under the PVH lock.
 * prev_cacheattr is used only afterwards to decide what cache maintenance
 * the transition requires.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	/* Only managed (pp_attr-tracked) pages may be updated here. */
	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Perform whatever cache maintenance the attribute transition requires. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10157 
/*
 * Return a kernel mapping for a compressor page.  When __ARM_PTE_PHYSMAP__ is
 * configured, the page is first forced to VM_WIMG_DEFAULT attributes so the
 * physical-aperture mapping is usable.
 *
 * @return Kernel virtual address of the page in the physical aperture.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10173 
/*
 * Counterpart of pmap_map_compressor_page(): restores the page's recorded
 * cache attributes (if they are not VM_WIMG_DEFAULT) when __ARM_PTE_PHYSMAP__
 * is configured.  No unmapping of kva is required; it lies in the aperture.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10188 
10189 /**
10190  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10191  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10192  *
10193  * @param user_page_list List of pages to be updated.
10194  * @param page_cnt Number of pages in total in user_page_list.
10195  * @param cacheattr The new cache attribute.
10196  *
 * @return True if the batch update completed successfully.
10198  */
10199 bool
10200 pmap_batch_set_cache_attributes(
10201 	upl_page_info_array_t user_page_list,
10202 	unsigned int page_cnt,
10203 	unsigned int cacheattr)
10204 {
10205 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10206 
10207 	if (page_cnt == 0) {
10208 		return true;
10209 	}
10210 
10211 	batch_set_cache_attr_state_t states;
10212 	states.page_index = 0;
10213 	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10214 	states.tlb_flush_pass_needed = false;
10215 	states.rt_cache_flush_pass_needed = false;
10216 
10217 	/* Verify we are being called from a preemptible context. */
10218 	pmap_verify_preemptible();
10219 
10220 	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10221 #if XNU_MONITOR
10222 		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10223 #else /* !XNU_MONITOR */
10224 		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10225 #endif /* XNU_MONITOR */
10226 	}
10227 
10228 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10229 	return true;
10230 }
10231 
10232 /**
10233  * Flushes TLB entries associated with the page specified by paddr, but do not
10234  * issue barriers yet.
10235  *
10236  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10237  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* Caller must hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page, issuing an async TLB invalidate for each. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* Skip IOMMU mappings; no CPU TLB invalidate is issued for them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PTE slot in this PV entry, or the next PV entry. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10290 
10291 /**
10292  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10293  *
10294  * @param pai The Physical Address Index of the entry.
10295  * @param cacheattr The new cache attribute.
10296  */
MARK_AS_PMAP_TEXT static void
pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
{
	pvh_assert_locked(pai);

	pp_attr_t pp_attr_current, pp_attr_template;
	do {
		/* Re-read each iteration: another CPU may have changed non-WIMG bits. */
		pp_attr_current = pp_attr_table[pai];
		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
}
10313 
10314 /**
10315  * Batch updates the cache attributes of a list of pages in three passes.
10316  *
10317  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10318  * In pass two, TLB entries are flushed for each page in the list if necessary.
10319  * In pass three, caches are cleaned for each page in the list if necessary.
10320  *
10321  * When running in PPL, this function may decide to return to the caller in response
10322  * to AST_URGENT.
10323  *
10324  * @param user_page_list List of pages to be updated.
10325  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10326  * @param page_cnt Number of pages in total in user_page_list.
10327  * @param cacheattr The new cache attributes.
10328  *
10329  * @return The new state of the state machine.
10330  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable state handed in by the caller. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		/* Pass 1: update the pp_attr table and PTEs for each page. */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* Absence of WIMG bits in the table means the page is at the default. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Pages newly transitioning into VM_WIMG_RT require a cache-flush pass. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* Synchronize the async TLB invalidates issued above. */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* On early exit, DC-by-VA ops must be disabled before leaving the pass. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the (possibly partial) state for the caller to resume from. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10556 
/*
 * Common implementation for updating the cache attributes of a single page.
 *
 * @param pn Page number of the page to update.
 * @param cacheattr The new cache attributes (VM_WIMG_*).
 * @param external Whether the request originated outside the PPL; only
 *        consulted on XNU_MONITOR builds for ownership checks.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The request origin must match the page's PPL ownership. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* Absence of WIMG bits in the table means the page is at the default. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Rewrite the page's mappings (with TLB flush) only on an effective change. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance required by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10619 
/* Wrapper that invokes the common implementation with external == TRUE. */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10627 
/*
 * Set the cache attributes of a page.  On XNU_MONITOR builds this enters the
 * PPL; otherwise the internal implementation is called directly.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10639 
10640 /**
10641  * Updates the page numbered ppnum to have attribute specified by attributes.
10642  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10643  * The necessity of the TLB flush is returned in case this function is called
10644  * in a batched manner and the TLB flush is intended to be done at a different
10645  * timing.
10646  *
10647  * @param ppnum Page Number of the page to be updated.
10648  * @param attributes The new cache attributes.
10649  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10650  *        immediately.
10651  *
10652  * @return Returns true if a TLB flush is needed for this update regardless of
10653  *         whether a flush has occurred already.
10654  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally panic if a device-type WIMG is requested for a managed page. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First rewrite the page's mapping in the kernel physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	/* Replace only the memory-attribute index and shareability fields. */
	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
	} else {
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	/* Then rewrite every mapping of the page recorded in its PV list. */
	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are skipped; they are not rewritten here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
		va = ptep_get_va(pte_p);

		/* Replace only the memory-attribute index and shareability fields. */
		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	/* Synchronize the invalidates only if we issued them ourselves. */
	if (perform_tlbi && tlb_flush_needed) {
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
10777 
10778 /**
10779  * Mark a pmap as being dedicated to use for a commpage mapping.
10780  * The pmap itself will never be activated on a CPU; its mappings will
10781  * only be embedded in userspace pmaps at a fixed virtual address.
10782  *
10783  * @param pmap the pmap to mark as belonging to a commpage.
10784  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be established before the PPL is locked down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
10804 
10805 static void
10806 pmap_update_tt3e(
10807 	pmap_t pmap,
10808 	vm_address_t address,
10809 	tt_entry_t template)
10810 {
10811 	tt_entry_t *ptep, pte;
10812 
10813 	ptep = pmap_tt3e(pmap, address);
10814 	if (ptep == NULL) {
10815 		panic("%s: no ptep?", __FUNCTION__);
10816 	}
10817 
10818 	pte = *ptep;
10819 	pte = tte_to_pa(pte) | template;
10820 	write_pte_strong(ptep, pte);
10821 }
10822 
/* PTE template for the commpage data page: read-only, never executable. */
/* Note absence of non-global bit */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/* PTE template for the commpage text page: read-only, user-executable (no ARM_PTE_NX). */
/* Note absence of non-global bit and no-execute bit.  */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
10834 
10835 void
10836 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10837     vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10838 {
10839 	kern_return_t kr;
10840 	pmap_paddr_t data_pa = 0; // data address
10841 	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10842 	pmap_paddr_t text_pa = 0; // text address
10843 
10844 	*kernel_data_addr = 0;
10845 	*kernel_text_addr = 0;
10846 	*user_text_addr = 0;
10847 
10848 #if XNU_MONITOR
10849 	data_pa = pmap_alloc_page_for_kern(0);
10850 	assert(data_pa);
10851 	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
10852 	ro_data_pa = pmap_alloc_page_for_kern(0);
10853 	assert(ro_data_pa);
10854 	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
10855 #if CONFIG_ARM_PFZ
10856 	text_pa = pmap_alloc_page_for_kern(0);
10857 	assert(text_pa);
10858 	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
10859 #endif
10860 
10861 #else /* XNU_MONITOR */
10862 	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
10863 	/*
10864 	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
10865 	 * mapped at page granularity, so a separate page for kernel RO data would not
10866 	 * be useful.
10867 	 */
10868 	ro_data_pa = data_pa;
10869 #if CONFIG_ARM_PFZ
10870 	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
10871 #endif
10872 
10873 #endif /* XNU_MONITOR */
10874 
10875 	/*
10876 	 * In order to avoid burning extra pages on mapping the shared page, we
10877 	 * create a dedicated pmap for the shared page.  We forcibly nest the
10878 	 * translation tables from this pmap into other pmaps.  The level we
10879 	 * will nest at depends on the MMU configuration (page size, TTBR range,
10880 	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
10881 	 *
10882 	 * Note that this is NOT "the nested pmap" (which is used to nest the
10883 	 * shared cache).
10884 	 *
10885 	 * Note that we update parameters of the entry for our unique needs (NG
10886 	 * entry, etc.).
10887 	 */
10888 	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
10889 	assert(commpage_pmap_default != NULL);
10890 	pmap_set_commpage(commpage_pmap_default);
10891 
10892 	/* The user 64-bit mappings... */
10893 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10894 	assert(kr == KERN_SUCCESS);
10895 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10896 
10897 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10898 	assert(kr == KERN_SUCCESS);
10899 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10900 #if CONFIG_ARM_PFZ
10901 	/* User mapping of comm page text section for 64 bit mapping only
10902 	 *
10903 	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
10904 	 * user processes to get this page mapped in, they should never call into
10905 	 * this page.
10906 	 *
10907 	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10908 	 * is slid in the same L3 as the data commpage.  It is either outside the
10909 	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
10910 	 * it is reserved and unavailable to mach VM for future mappings.
10911 	 */
10912 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
10913 	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
10914 
10915 	vm_map_address_t commpage_text_va = 0;
10916 
10917 	do {
10918 		int text_leaf_index = random() % num_ptes;
10919 
10920 		// Generate a VA for the commpage text with the same root and twig index as data
10921 		// comm page, but with new leaf index we've just generated.
10922 		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
10923 		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
10924 	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10925 
10926 	// Assert that this is empty
10927 	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
10928 	assert(ptep != PT_ENTRY_NULL);
10929 	assert(*ptep == ARM_TTE_EMPTY);
10930 
10931 	// At this point, we've found the address we want to insert our comm page at
10932 	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10933 	assert(kr == KERN_SUCCESS);
10934 	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
10935 	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
10936 
10937 	*user_text_addr = commpage_text_va;
10938 #endif
10939 
10940 	/* ...and the user 32-bit mappings. */
10941 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10942 	assert(kr == KERN_SUCCESS);
10943 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10944 
10945 	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10946 	assert(kr == KERN_SUCCESS);
10947 	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10948 #if __ARM_MIXED_PAGE_SIZE__
10949 	/**
10950 	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
10951 	 * new set of page tables that point to the exact same 16K shared page as
10952 	 * before. Only the first 4K of the 16K shared page is mapped since that's
10953 	 * the only part that contains relevant data.
10954 	 */
10955 	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
10956 	assert(commpage_pmap_4k != NULL);
10957 	pmap_set_commpage(commpage_pmap_4k);
10958 
10959 	/* The user 64-bit mappings... */
10960 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10961 	assert(kr == KERN_SUCCESS);
10962 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10963 
10964 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10965 	assert(kr == KERN_SUCCESS);
10966 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10967 
10968 	/* ...and the user 32-bit mapping. */
10969 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10970 	assert(kr == KERN_SUCCESS);
10971 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10972 
10973 	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
10974 	assert(kr == KERN_SUCCESS);
10975 	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
10976 #endif
10977 
10978 	/* For manipulation in kernel, go straight to physical page */
10979 	*kernel_data_addr = phystokv(data_pa);
10980 	assert(commpage_ro_data_kva == 0);
10981 	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
10982 	assert(commpage_text_kva == 0);
10983 	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
10984 }
10985 
10986 
10987 /*
10988  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10989  * with user controlled TTEs for regions that aren't explicitly reserved by the
10990  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10991  */
10992 #if (ARM_PGSHIFT == 14)
10993 /**
10994  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10995  * commpage completely above the maximum 32-bit userspace VA.
10996  */
10997 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10998 
10999 /**
11000  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11001  * userspace VAs can nest the commpage completely above the maximum 64-bit
11002  * userpace VA, but that technically isn't true on macOS. On those systems, the
11003  * commpage lives within the userspace VA range, but is protected by the VM as
11004  * a reserved region (see vm_reserved_regions[] definition for more info).
11005  */
11006 
11007 #elif (ARM_PGSHIFT == 12)
11008 /**
11009  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11010  * above the maximum userspace VA.
11011  */
11012 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11013 #else
11014 #error Nested shared page mapping is unsupported on this config
11015 #endif
11016 
/**
 * Map the commpage into a user pmap by copying ("nesting") the commpage
 * pmap's twig-level table entry into the target pmap's page tables.
 *
 * @note On XNU_MONITOR configurations this runs inside the PPL and expands
 *       the pmap with PMAP_OPTIONS_NOWAIT, so callers must be prepared to
 *       retry on KERN_RESOURCE_SHORTAGE (and on KERN_ABORTED everywhere).
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (and, with XNU_MONITOR,
 *         KERN_RESOURCE_SHORTAGE) when pmap_expand() could not complete.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL we cannot block for memory; the caller retries instead. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Drop the lock: pmap_expand() may allocate and must not hold it. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		/* Re-take the lock and re-check: another thread may have raced us. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the commpage pmap's pre-built twig entry into the target pmap. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11119 
/**
 * Remove the commpage mapping from a user pmap by clearing the nested
 * twig-level entry installed by pmap_insert_commpage_internal(), then
 * flushing the TLB for the commpage VA.
 *
 * @param pmap The user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never materialized. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any cached translations for the commpage VA in this ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false);
	sync_tlb_flush();
}
11176 
11177 void
11178 pmap_insert_commpage(
11179 	pmap_t pmap)
11180 {
11181 	kern_return_t kr = KERN_FAILURE;
11182 #if XNU_MONITOR
11183 	do {
11184 		kr = pmap_insert_commpage_ppl(pmap);
11185 
11186 		if (kr == KERN_RESOURCE_SHORTAGE) {
11187 			pmap_alloc_page_for_ppl(0);
11188 		}
11189 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11190 
11191 	pmap_ledger_check_balance(pmap);
11192 #else
11193 	do {
11194 		kr = pmap_insert_commpage_internal(pmap);
11195 	} while (kr == KERN_ABORTED);
11196 #endif
11197 
11198 	if (kr != KERN_SUCCESS) {
11199 		panic("%s: failed to insert the shared page, kr=%d, "
11200 		    "pmap=%p",
11201 		    __FUNCTION__, kr,
11202 		    pmap);
11203 	}
11204 }
11205 
11206 static boolean_t
11207 pmap_is_64bit(
11208 	pmap_t pmap)
11209 {
11210 	return pmap->is_64bit;
11211 }
11212 
11213 bool
11214 pmap_is_exotic(
11215 	pmap_t pmap __unused)
11216 {
11217 	return false;
11218 }
11219 
11220 
11221 /* ARMTODO -- an implementation that accounts for
11222  * holes in the physical map, if any.
11223  */
11224 boolean_t
11225 pmap_valid_page(
11226 	ppnum_t pn)
11227 {
11228 	return pa_valid(ptoa(pn));
11229 }
11230 
11231 boolean_t
11232 pmap_bootloader_page(
11233 	ppnum_t pn)
11234 {
11235 	pmap_paddr_t paddr = ptoa(pn);
11236 
11237 	if (pa_valid(paddr)) {
11238 		return FALSE;
11239 	}
11240 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11241 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11242 }
11243 
/**
 * Scan [va_start, va_end) in the given pmap and report whether any valid
 * leaf mappings exist in that range.
 *
 * @param pmap     The pmap to scan; NULL is treated as trivially empty.
 * @param va_start Start of the virtual range (inclusive).
 * @param va_end   End of the virtual range (exclusive).
 *
 * @return TRUE if no PTE in the range is valid, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/*
	 * Snapshot not_in_kdp so the lock and unlock decisions below stay
	 * consistent even if the global changes mid-scan.  No lock is taken
	 * when running in the kernel debugger context.
	 */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig (leaf-table span) at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* A leaf table exists for this twig; scan its PTEs in range. */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11304 
/**
 * Check whether a pmap has no valid mappings in [va_start, va_end).
 * Dispatches into the PPL on XNU_MONITOR configurations.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11317 
11318 vm_map_offset_t
11319 pmap_max_offset(
11320 	boolean_t               is64,
11321 	unsigned int    option)
11322 {
11323 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11324 }
11325 
11326 vm_map_offset_t
11327 pmap_max_64bit_offset(
11328 	__unused unsigned int option)
11329 {
11330 	vm_map_offset_t max_offset_ret = 0;
11331 
11332 #if defined(__arm64__)
11333 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11334 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11335 		max_offset_ret = arm64_pmap_max_offset_default;
11336 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11337 		max_offset_ret = min_max_offset;
11338 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11339 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11340 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11341 		if (arm64_pmap_max_offset_default) {
11342 			max_offset_ret = arm64_pmap_max_offset_default;
11343 		} else if (max_mem > 0xC0000000) {
11344 			// devices with > 3GB of memory
11345 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11346 		} else if (max_mem > 0x40000000) {
11347 			// devices with > 1GB and <= 3GB of memory
11348 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11349 		} else {
11350 			// devices with <= 1 GB of memory
11351 			max_offset_ret = min_max_offset;
11352 		}
11353 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11354 		if (arm64_pmap_max_offset_default) {
11355 			// Allow the boot-arg to override jumbo size
11356 			max_offset_ret = arm64_pmap_max_offset_default;
11357 		} else {
11358 			max_offset_ret = MACH_VM_MAX_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11359 		}
11360 	} else {
11361 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11362 	}
11363 
11364 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11365 	assert(max_offset_ret >= min_max_offset);
11366 #else
11367 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11368 #endif
11369 
11370 	return max_offset_ret;
11371 }
11372 
11373 vm_map_offset_t
11374 pmap_max_32bit_offset(
11375 	unsigned int option)
11376 {
11377 	vm_map_offset_t max_offset_ret = 0;
11378 
11379 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11380 		max_offset_ret = arm_pmap_max_offset_default;
11381 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11382 		max_offset_ret = VM_MAX_ADDRESS;
11383 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11384 		max_offset_ret = VM_MAX_ADDRESS;
11385 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11386 		if (arm_pmap_max_offset_default) {
11387 			max_offset_ret = arm_pmap_max_offset_default;
11388 		} else if (max_mem > 0x20000000) {
11389 			max_offset_ret = VM_MAX_ADDRESS;
11390 		} else {
11391 			max_offset_ret = VM_MAX_ADDRESS;
11392 		}
11393 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11394 		max_offset_ret = VM_MAX_ADDRESS;
11395 	} else {
11396 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11397 	}
11398 
11399 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11400 	return max_offset_ret;
11401 }
11402 
11403 #if CONFIG_DTRACE
11404 /*
11405  * Constrain DTrace copyin/copyout actions
11406  */
11407 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11408 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11409 
11410 kern_return_t
11411 dtrace_copyio_preflight(
11412 	__unused addr64_t va)
11413 {
11414 	if (current_map() == kernel_map) {
11415 		return KERN_FAILURE;
11416 	} else {
11417 		return KERN_SUCCESS;
11418 	}
11419 }
11420 
/* DTrace copy I/O postcondition: no cleanup is required on ARM. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11427 #endif /* CONFIG_DTRACE */
11428 
11429 
/* No state to initialize: deferred TLB flushing is not implemented on ARM (see pmap_flush). */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11434 
11435 
11436 void
11437 pmap_flush(
11438 	__unused pmap_flush_context *cpus_to_flush)
11439 {
11440 	/* not implemented yet */
11441 	return;
11442 }
11443 
11444 #if XNU_MONITOR
11445 
11446 /*
11447  * Enforce that the address range described by kva and nbytes is not currently
11448  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11449  * unintentionally writing to PPL-owned memory.
11450  */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Walk the range one page at a time, pinning each backing physical page. */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Refuse to pin through the static physical-aperture mapping. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * CAS loop: atomically set PP_ATTR_NO_MONITOR, panicking if the page
		 * is (or concurrently becomes) PPL-owned before the swap lands.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11473 
/*
 * Release the pin taken by pmap_pin_kernel_pages() on every page backing
 * [kva, kva + nbytes), panicking if any page was not actually pinned.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page must never have become PPL-owned while pinned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11491 
11492 /**
11493  * Lock down a page, making all mappings read-only, and preventing further
11494  * mappings or removal of this particular kva's mapping. Effectively, it makes
11495  * the physical page at kva immutable (see the ppl_writable parameter for an
11496  * exception to this).
11497  *
11498  * @param kva Valid address to any mapping of the physical page to lockdown.
11499  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11500  * @param ppl_writable True if the PPL should still be able to write to the page
11501  *                     using the physical aperture mapping. False will make the
11502  *                     page read-only for both the kernel and PPL in the
11503  *                     physical aperture.
11504  */
11505 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown: cap all existing alias mappings at read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11511 
11512 /**
11513  * Lock down a page, giving all mappings the specified maximum permissions, and
11514  * preventing further mappings or removal of this particular kva's mapping.
11515  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11516  * parameter for an exception to this).
11517  *
11518  * @param kva Valid address to any mapping of the physical page to lockdown.
11519  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11520  * @param ppl_writable True if the PPL should still be able to write to the page
11521  *                     using the physical aperture mapping. False will make the
11522  *                     page read-only for both the kernel and PPL in the
11523  *                     physical aperture.
11524  * @param prot Maximum permissions to allow in existing alias mappings
11525  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must pass a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* A page may only be locked down once, and never an executable page. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote every existing alias mapping of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11573 
11574 /**
11575  * Helper for releasing a page from being locked down to the PPL, making it writable to the
11576  * kernel once again.
11577  *
11578  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11579  *       to unlockdown a page that was never locked down, will panic.
11580  *
11581  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
11582  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11583  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11584  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11585  *                     deviation will result in a panic.
11586  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlocking a page that was never locked down with this flag is a bug. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11607 
11608 /**
11609  * Release a page from being locked down to the PPL, making it writable to the
11610  * kernel once again.
11611  *
11612  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11613  *       to unlockdown a page that was never locked down, will panic.
11614  *
11615  * @param kva Valid address to any mapping of the physical page to unlockdown.
11616  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11617  * @param ppl_writable This must match whatever `ppl_writable` parameter was
11618  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
11619  *                     deviation will result in a panic.
11620  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must pass a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PVH lock required by the _locked variant, then delegate. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11632 
11633 #else /* XNU_MONITOR */
11634 
/* No-op: pinning is only meaningful when the PPL (XNU_MONITOR) is present. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11639 
/* No-op counterpart to the pmap_pin_kernel_pages() stub above on non-PPL builds. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11644 
11645 #endif /* !XNU_MONITOR */
11646 
11647 
/*
 * Lock down [kva, kva + size) on behalf of code signing. On PPL systems the
 * CS lockdown-reason flag is recorded; without the PPL there is no flag to
 * apply, so 0 is passed instead.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11657 
/*
 * Undo a code-signing lockdown of [kva, kva + size), passing the same
 * flag (or 0 on non-PPL builds) used by pmap_cs_lockdown_pages().
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11667 
11668 /**
11669  * Perform basic validation checks on the destination only and
11670  * corresponding offset/sizes prior to writing to a read only allocation.
11671  *
11672  * @note Should be called before writing to an allocation from the read
11673  * only allocator.
11674  *
11675  * @param zid The ID of the zone the allocation belongs to.
11676  * @param va VA of element being modified (destination).
11677  * @param offset Offset being written to, in the element.
11678  * @param new_data_size Size of modification.
11679  *
11680  */
11681 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element_dst(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	vm_size_t           new_data_size)
{
	/* Only zone IDs in the read-only range may be written through this path. */
	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
	}

	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;

	/* Check element is from correct zone and properly aligned */
	zone_require_ro(zid, elem_size, (void*)va);

	/*
	 * NOTE(review): if offset > elem_size, (elem_size - offset) wraps to a
	 * huge unsigned value and this first check passes vacuously; the
	 * offset >= elem_size check below still catches that case, so the two
	 * checks are safe taken together.
	 */
	if (__improbable(new_data_size > (elem_size - offset))) {
		panic("%s: New data size %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
	}
	if (__improbable(offset >= elem_size)) {
		panic("%s: Offset %lu too large for elem size %lu at addr %p",
		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
	}
}
11708 
11709 
11710 /**
11711  * Perform basic validation checks on the source, destination and
11712  * corresponding offset/sizes prior to writing to a read only allocation.
11713  *
11714  * @note Should be called before writing to an allocation from the read
11715  * only allocator.
11716  *
11717  * @param zid The ID of the zone the allocation belongs to.
11718  * @param va VA of element being modified (destination).
11719  * @param offset Offset being written to, in the element.
11720  * @param new_data Pointer to new data (source).
11721  * @param new_data_size Size of modification.
11722  *
11723  */
11724 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	vm_offset_t sum = 0;

	/*
	 * Reject a source range [new_data, new_data + new_data_size) that
	 * wraps the address space; a wrapped range could otherwise slip an
	 * out-of-bounds source past the size checks below.
	 */
	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
		panic("%s: Integer addition overflow %p + %lu = %lu",
		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
	}

	/* Destination-side checks: zone ID range, element membership, offset/size. */
	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
}
11742 
11743 /**
11744  * Ensure that physical page is locked down and pinned, before writing to it.
11745  *
11746  * @note Should be called before writing to an allocation from the read
11747  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
11748  * ensure that it is called after the modification.
11749  *
11750  *
11751  * @param pa Physical address of the element being modified.
11752  * @param va Virtual address of element being modified.
11753  * @param size Size of the modification.
11754  *
11755  */
11756 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	/* Take the PV head lock for the page; released in pmap_ro_zone_unlock_phy_page(). */
	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	/* On PPL systems the page must already carry the RO-lockdown flag. */
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
11777 
11778 /**
11779  * Unlock and unpin physical page after writing to it.
11780  *
11781  * @note Should be called after writing to an allocation from the read
11782  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
11783  * ensure that it has been called prior to the modification.
11784  *
11785  * @param pa Physical address of the element that was modified.
11786  * @param va Virtual address of element that was modified.
11787  * @param size Size of the modification.
11788  *
11789  */
11790 
11791 MARK_AS_PMAP_TEXT static void
11792 pmap_ro_zone_unlock_phy_page(
11793 	const pmap_paddr_t  pa,
11794 	vm_offset_t         va,
11795 	vm_size_t           size)
11796 {
11797 	const unsigned int pai = pa_index(pa);
11798 	pmap_unpin_kernel_pages(va, size);
11799 	pvh_unlock(pai);
11800 }
11801 
11802 /**
11803  * Function to copy kauth_cred from new_data to kv.
11804  * Function defined in "kern_prot.c"
11805  *
11806  * @note Will be removed upon completion of
11807  * <rdar://problem/72635194> Compiler PAC support for memcpy.
11808  *
11809  * @param kv Address to copy new data to.
11810  * @param new_data Pointer to new data.
11811  *
11812  */
11813 
11814 extern void
11815 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11816 
11817 /**
11818  * Zalloc-specific memcpy that writes through the physical aperture
11819  * and ensures the element being modified is from a read-only zone.
11820  *
11821  * @note Designed to work only with the zone allocator's read-only submap.
11822  *
11823  * @param zid The ID of the zone to allocate from.
11824  * @param va VA of element to be modified.
11825  * @param offset Offset from element.
11826  * @param new_data Pointer to new data.
11827  * @param new_data_size	Size of modification.
11828  *
11829  */
11830 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* Dispatch: on PPL systems the write must be performed inside the PPL. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
11845 
11846 MARK_AS_PMAP_TEXT void
11847 pmap_ro_zone_memcpy_internal(
11848 	zone_id_t             zid,
11849 	vm_offset_t           va,
11850 	vm_offset_t           offset,
11851 	const vm_offset_t     new_data,
11852 	vm_size_t             new_data_size)
11853 {
11854 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11855 
11856 	if (!new_data || new_data_size == 0) {
11857 		return;
11858 	}
11859 
11860 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11861 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
11862 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11863 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
11864 }
11865 
11866 /**
11867  * Zalloc-specific function to atomically mutate fields of an element that
11868  * belongs to a read-only zone, via the physcial aperture.
11869  *
11870  * @note Designed to work only with the zone allocator's read-only submap.
11871  *
11872  * @param zid The ID of the zone the element belongs to.
11873  * @param va VA of element to be modified.
11874  * @param offset Offset in element.
11875  * @param op Atomic operation to perform.
11876  * @param value	Mutation value.
11877  *
11878  */
11879 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* Dispatch: on PPL systems the mutation must be performed inside the PPL. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
11894 
11895 MARK_AS_PMAP_TEXT uint64_t
11896 pmap_ro_zone_atomic_op_internal(
11897 	zone_id_t             zid,
11898 	vm_offset_t           va,
11899 	vm_offset_t           offset,
11900 	zro_atomic_op_t       op,
11901 	uint64_t              value)
11902 {
11903 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11904 	vm_size_t value_size = op & 0xf;
11905 
11906 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11907 	pmap_ro_zone_lock_phy_page(pa, va, value_size);
11908 	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11909 	pmap_ro_zone_unlock_phy_page(pa, va, value_size);
11910 
11911 	return value;
11912 }
11913 
11914 /**
11915  * bzero for allocations from read only zones, that writes through the
11916  * physical aperture.
11917  *
11918  * @note This is called by the zfree path of all allocations from read
11919  * only zones.
11920  *
11921  * @param zid The ID of the zone the allocation belongs to.
11922  * @param va VA of element to be zeroed.
11923  * @param offset Offset in the element.
11924  * @param size	Size of allocation.
11925  *
11926  */
11927 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* Dispatch: on PPL systems the zeroing must be performed inside the PPL. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
11941 
11942 MARK_AS_PMAP_TEXT void
11943 pmap_ro_zone_bzero_internal(
11944 	zone_id_t       zid,
11945 	vm_offset_t     va,
11946 	vm_offset_t     offset,
11947 	vm_size_t       size)
11948 {
11949 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11950 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11951 	pmap_ro_zone_lock_phy_page(pa, va, size);
11952 	bzero((void*)phystokv(pa), size);
11953 	pmap_ro_zone_unlock_phy_page(pa, va, size);
11954 }
11955 
11956 /**
11957  * Removes write access from the Physical Aperture.
11958  *
11959  * @note For non-PPL devices, it simply makes all virtual mappings RO.
11960  * @note Designed to work only with the zone allocator's read-only submap.
11961  *
11962  * @param va VA of the page to restore write access to.
11963  *
11964  */
11965 MARK_AS_PMAP_TEXT static void
11966 pmap_phys_write_disable(vm_address_t va)
11967 {
11968 #if XNU_MONITOR
11969 	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
11970 #else /* XNU_MONITOR */
11971 	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
11972 #endif /* XNU_MONITOR */
11973 }
11974 
11975 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
11976 
/**
 * Count resident and compressed bytes within a single-TTE VA range.
 *
 * @note The [start, end) range must be page-aligned and must not span more
 *       than one twig (TTE) entry; callers iterate twig-by-twig.
 *
 * @param pmap Target pmap; NULL yields PMAP_RESIDENT_INVALID.
 * @param start Page-aligned start of the range.
 * @param end Page-aligned end of the range (same twig as start).
 * @param compressed_bytes_p If non-NULL, compressed byte count for the range
 *        is ADDED into this out-parameter.
 *
 * @return Resident bytes in the range, or PMAP_RESIDENT_INVALID if the pmap
 *         is NULL or has no page table for the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Walk the leaf (PTE) table covering [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the out-parameter while updating it (may run inside the PPL). */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12038 
12039 mach_vm_size_t
12040 pmap_query_resident(
12041 	pmap_t                  pmap,
12042 	vm_map_address_t        start,
12043 	vm_map_address_t        end,
12044 	mach_vm_size_t          *compressed_bytes_p)
12045 {
12046 	mach_vm_size_t          total_resident_bytes;
12047 	mach_vm_size_t          compressed_bytes;
12048 	vm_map_address_t        va;
12049 
12050 
12051 	if (pmap == PMAP_NULL) {
12052 		if (compressed_bytes_p) {
12053 			*compressed_bytes_p = 0;
12054 		}
12055 		return 0;
12056 	}
12057 
12058 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12059 
12060 	total_resident_bytes = 0;
12061 	compressed_bytes = 0;
12062 
12063 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12064 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12065 	    VM_KERNEL_ADDRHIDE(end));
12066 
12067 	va = start;
12068 	while (va < end) {
12069 		vm_map_address_t l;
12070 		mach_vm_size_t resident_bytes;
12071 
12072 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12073 
12074 		if (l > end) {
12075 			l = end;
12076 		}
12077 #if XNU_MONITOR
12078 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12079 #else
12080 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12081 #endif
12082 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12083 			break;
12084 		}
12085 
12086 		total_resident_bytes += resident_bytes;
12087 
12088 		va = l;
12089 	}
12090 
12091 	if (compressed_bytes_p) {
12092 		*compressed_bytes_p = compressed_bytes;
12093 	}
12094 
12095 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12096 	    total_resident_bytes);
12097 
12098 	return total_resident_bytes;
12099 }
12100 
12101 #if MACH_ASSERT
/*
 * Debug-only (MACH_ASSERT) check that all of a dying pmap's ledger entries
 * are balanced, attributed to the owning pid/procname for diagnostics.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int     pid;
	char    *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()).  Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
12130 #endif /* MACH_ASSERT */
12131 
/* Pagezero-range advice is a no-op on this architecture. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12136 
12137 /**
12138  * The minimum shared region nesting size is used by the VM to determine when to
12139  * break up large mappings to nested regions. The smallest size that these
12140  * mappings can be broken into is determined by what page table level those
12141  * regions are being nested in at and the size of the page tables.
12142  *
12143  * For instance, if a nested region is nesting at L2 for a process utilizing
12144  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12145  * block entry).
12146  *
12147  * @param pmap The target pmap to determine the block size based on whether it's
12148  *             using 16KB or 4KB page tables.
12149  */
12150 uint64_t
12151 pmap_shared_region_size_min(__unused pmap_t pmap)
12152 {
12153 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12154 
12155 	/**
12156 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12157 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12158 	 * point to shared L3 page tables in the shared region pmap.
12159 	 */
12160 	return pt_attr_twig_size(pt_attr);
12161 }
12162 
12163 boolean_t
12164 pmap_enforces_execute_only(
12165 	pmap_t pmap)
12166 {
12167 	return pmap != kernel_pmap;
12168 }
12169 
/* Record the VM map's code-signing-enforcement flag on a validated pmap. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12178 
/* Dispatch: on PPL systems the flag update must happen inside the PPL. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12190 
12191 extern int cs_process_enforcement_enable;
12192 bool
12193 pmap_get_vm_map_cs_enforced(
12194 	pmap_t pmap)
12195 {
12196 	if (cs_process_enforcement_enable) {
12197 		return true;
12198 	}
12199 	return pmap->pmap_vm_map_cs_enforced;
12200 }
12201 
/* JIT entitlement tracking is not implemented in this configuration: no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12208 
/* Dispatch to the PPL (or internal) no-op JIT-entitlement setter. */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12219 
/* JIT entitlement state is never tracked here; always reports false. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12226 
/* TPRO support is not implemented in this configuration: no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12233 
/* Dispatch to the PPL (or internal) no-op TPRO setter. */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12244 
/* TPRO state is never tracked here; always reports false. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12251 
12252 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12253 
/**
 * Report the disposition (present/compressed/alt-accounted/reusable/internal)
 * of the page mapped at va in the given pmap.
 *
 * @param pmap Target user pmap; NULL or kernel_pmap is rejected.
 * @param va User VA to query.
 * @param disp_p Out-parameter receiving the PMAP_QUERY_PAGE_* disposition bits
 *        (pinned around the store since this may run inside the PPL).
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a bad pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE; re-checked below after taking the PV head lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* Not a managed page; no PV state to consult. */
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping to read its accounting bits. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12332 
/* Dispatch: on PPL systems the page-info query must run inside the PPL. */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12345 
12346 
12347 
/* Number of significant user VA bits for this pmap. */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* Derive the VA width from the pmap's own TCR T0SZ field. */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	/* All pmaps share the boot-time T0SZ configuration. */
	return 64 - T0SZ_BOOT;
#endif
}
12358 
/* Number of significant kernel VA bits, from the boot-time T1SZ setting. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12364 
/* Size in bytes of the user VA space addressable through this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12370 
12371 
12372 
/* Whether the caller is executing inside the PPL; unsupported here. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12379 
/* IO-filtered protected writes are unsupported on this platform: always panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12386 
/* No PPL page reserve exists in this configuration; always NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12393 
/* No PPL page reserve exists in this configuration; no-op. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12399 
12400 
12401 #if PMAP_CS_PPL_MONITOR
12402 
12403 /* Immutable part of the trust cache runtime */
12404 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12405 
12406 /* Mutable part of the trust cache runtime */
12407 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12408 
12409 /* Lock for the trust cache runtime */
12410 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12411 
/**
 * Ask the trust-cache runtime whether a trust cache with the given UUID is
 * loaded, holding the runtime lock shared across the query.
 *
 * @param check_uuid UUID to look up.
 *
 * @return KERN_SUCCESS if found, KERN_NOT_FOUND if absent,
 *         KERN_NOT_SUPPORTED if the AMFI interface is too old,
 *         KERN_FAILURE on any other library error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	if (amfi->TrustCache.version < 3) {
		/* AMFI change hasn't landed in the build */
		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
		return KERN_NOT_SUPPORTED;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto a kern_return_t. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12447 
/* Kernel-side entry point: trampoline the UUID check into the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12454 
/**
 * Validate, lock down and load a trust cache of the given type into the PPL
 * trust-cache runtime.
 *
 * On success the payload pages stay locked down (owned by the runtime); on
 * failure they are unlocked again.  The manifest is always unlocked before
 * returning.  May return KERN_RESOURCE_SHORTAGE if no spare PPL page is
 * available; the kernel-side wrapper retries in that case.
 *
 * @param type Trust cache type; loadability and entitlements are enforced.
 * @param pmap_img4_payload Kernel VA of the pmap_img4_payload_t wrapper.
 * @param pmap_img4_payload_len Length of the wrapper (header + img4 payload).
 * @param img4_manifest Kernel VA of the img4 manifest.
 * @param img4_manifest_len Length of the manifest.
 * @param img4_aux_manifest Unused for now.
 * @param img4_aux_manifest_len Unused for now.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		/* Success: payload pages stay locked down for the runtime. */
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12574 
/*
 * Kernel-side entry point for loading a trust cache: trampolines into the
 * PPL and, when the PPL reports a page shortage, donates a page and retries.
 *
 * NOTE(review): the retry loop is unbounded — it assumes
 * pmap_alloc_page_for_ppl() eventually makes progress; confirm this cannot
 * spin forever under memory pressure.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
12604 
/**
 * Query the trust-cache runtime for a CDHash, assuming all arguments are
 * already in PPL-safe storage (see pmap_query_trust_cache_internal for the
 * copy-in variant).
 *
 * @param query_type Which caches to consult; validated against kTCQueryTypeTotal.
 * @param cdhash CDHash to look up.
 * @param query_token Optional out-parameter describing the matching entry.
 *
 * @return KERN_SUCCESS / KERN_NOT_FOUND / KERN_INVALID_ARGUMENT / KERN_FAILURE.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto a kern_return_t. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12643 
/*
 * PPL entry point for a trust-cache query: copies the kernel-supplied
 * CDHash into PPL storage, queries through the safe API, then copies the
 * result token back out through pinned kernel pages.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the caller's token while writing the result back out. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
12668 
12669 kern_return_t
12670 pmap_query_trust_cache(
12671 	TCQueryType_t query_type,
12672 	const uint8_t cdhash[kTCEntryHashSize],
12673 	TrustCacheQueryToken_t *query_token)
12674 {
12675 	kern_return_t ret = KERN_NOT_FOUND;
12676 
12677 	ret = pmap_query_trust_cache_ppl(
12678 		query_type,
12679 		cdhash,
12680 		query_token);
12681 
12682 	return ret;
12683 }
12684 
/* True once the developer-mode state below has been explicitly set at least once. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
/* Current developer-mode state; false-->true transitions are rejected once set. */
MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12687 
/*
 * PPL-side developer-mode toggle.  Enforces the one-way policy below:
 * once developer mode has been explicitly set to false, it can never be
 * turned back on (panics on a false-->true attempt).
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
12716 
/* Kernel-side entry point: trampoline the developer-mode toggle into the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
12723 
12724 #endif /* PMAP_CS_PPL_MONITOR */
12725 
12726 #if PMAP_CS_INCLUDE_CODE_SIGNING
12727 
/*
 * Ordering function for the registered-profiles red-black tree: profiles
 * are keyed by their object addresses.
 *
 * Compare the addresses as uintptr_t rather than with raw pointer
 * relational operators: relational comparison of pointers into distinct
 * objects is undefined behavior in C (C11 6.5.8), while integer
 * comparison is well defined and yields the same total order on all
 * supported ABIs.
 *
 * Returns -1 / 0 / 1 per the RB_GENERATE comparator contract.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
12740 
/*
 * Red-black tree for managing provisioning profiles, keyed by object address
 * (see pmap_cs_profiles_rbtree_compare).
 */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Reader-writer lock protecting the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
12750 
/*
 * One-time initialization of the PPL provisioning profile state: sets up the
 * registered-profiles red-black tree and its reader-writer lock.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock (marked non-sleeping) */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
12763 
/*
 * Returns true if the profile provisions the "beta-reports-active"
 * entitlement with a value of true, which identifies a TestFlight profile.
 */
static bool
pmap_is_testflight_profile(
	pmap_cs_profile_t *profile_obj)
{
	const char *entitlement_name = "beta-reports-active";
	const size_t entitlement_length = strlen(entitlement_name);
	CEQueryOperation_t query[2] = {0};

	/* If the profile provisions no entitlements, then it isn't a test flight one */
	if (profile_obj->entitlements_ctx == NULL) {
		return false;
	}

	/* Build our CoreEntitlements query: select the key, then match on boolean true */
	query[0].opcode = kCEOpSelectKey;
	/* NOTE(review): assumes stringParameter.data is large enough for the key -- verify against CEQueryOperation_t */
	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
	query[0].parameters.stringParameter.length = entitlement_length;
	query[1] = CEMatchBool(true);

	/* Run the query against the profile's entitlements context */
	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
		profile_obj->entitlements_ctx,
		query, 2);

	if (ce_err == amfi->CoreEntitlements.kNoError) {
		return true;
	}

	return false;
}
12793 
/*
 * Determine whether a provisioning profile is a development profile.
 *
 * A profile is treated as non-development when it is a universal
 * provisioning profile (its "ProvisionsAllDevices" value is true) or a
 * TestFlight profile; every other profile is considered development.
 */
static bool
pmap_is_development_profile(
	pmap_cs_profile_t *profile_obj)
{
	/* Check for UPP */
	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("ProvisionsAllDevices"));
	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
			return false;
		}
	}

	/* Check for TestFlight profile */
	if (pmap_is_testflight_profile(profile_obj) == true) {
		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
		return false;
	}

	pmap_cs_log_info("%p: development profile", profile_obj);
	return true;
}
12818 
/*
 * Extract, validate, and set up the entitlements dictionary provisioned by a
 * profile, storing an unmanaged CoreEntitlements query context within the
 * profile object.
 *
 * Returns:
 *   KERN_SUCCESS   -- entitlements context successfully set up
 *   KERN_NOT_FOUND -- the profile provisions no entitlements (benign)
 *   KERN_ABORTED   -- entitlements failed validation or context acquisition
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Select the "Entitlements" value from the profile's DER dictionary */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Raw DER bounds of the entitlements dictionary */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
12869 
/*
 * PPL entry point for registering a provisioning profile with the monitor.
 *
 * The payload contains the raw profile blob followed by storage for the
 * PPL's profile management structure (pmap_profile_payload_t); its pages are
 * locked down (kept PPL-writable) for the lifetime of the registration. The
 * profile is validated through CoreTrust and, on success, inserted into the
 * registered profiles red-black tree.
 *
 * Returns KERN_SUCCESS on success, or KERN_RESOURCE_SHORTAGE when the caller
 * must donate a page to the PPL and retry; panics on validation failure.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile blob through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements DER context over the validated profile content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
12973 
12974 kern_return_t
12975 pmap_register_provisioning_profile(
12976 	const vm_address_t payload_addr,
12977 	const vm_size_t payload_size)
12978 {
12979 	kern_return_t ret = KERN_DENIED;
12980 
12981 	ret = pmap_register_provisioning_profile_ppl(
12982 		payload_addr,
12983 		payload_size);
12984 
12985 	while (ret == KERN_RESOURCE_SHORTAGE) {
12986 		/* Allocate a page from the free list */
12987 		pmap_alloc_page_for_ppl(0);
12988 
12989 		/* Attempt the call again */
12990 		ret = pmap_register_provisioning_profile_ppl(
12991 			payload_addr,
12992 			payload_size);
12993 	}
12994 
12995 	return ret;
12996 }
12997 
/*
 * PPL entry point for unregistering a previously registered provisioning
 * profile.
 *
 * Returns:
 *   KERN_SUCCESS -- profile removed; its payload pages were unlocked
 *   KERN_FAILURE -- profile is still referenced by a code signature
 * Panics if the profile was never registered.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* Refuse to unregister while any code signature still holds a reference */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/*
		 * Unlock the profile payload. The profile object lives inside the
		 * payload, so it must not be accessed after this point.
		 */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		profile_obj = NULL;
	}
	return ret;
}
13045 
/*
 * Kernel entry point for unregistering a provisioning profile; forwards the
 * request into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13052 
/*
 * Associate a registered provisioning profile with a code signature object.
 *
 * Association is only permitted while the signature is still untrusted and
 * has no profile associated. Takes a reference on the profile which is
 * dropped by pmap_disassociate_provisioning_profile_internal.
 *
 * Returns KERN_SUCCESS on success or KERN_DENIED when the signature state
 * disallows the association; panics on an unknown or unverified profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13101 
/*
 * Kernel entry point for associating a provisioning profile with a code
 * signature; forwards the request into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13109 
/*
 * Remove the association between a code signature and its provisioning
 * profile, dropping the reference taken at association time.
 *
 * Returns KERN_SUCCESS on disassociation or KERN_NOT_FOUND when no profile
 * was associated with the signature.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13145 
/*
 * Kernel entry point for disassociating a provisioning profile from a code
 * signature; forwards the request into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13152 
13153 kern_return_t
13154 pmap_associate_kernel_entitlements_internal(
13155 	pmap_cs_code_directory_t *cd_entry,
13156 	const void *kernel_entitlements)
13157 {
13158 	kern_return_t ret = KERN_DENIED;
13159 
13160 	if (kernel_entitlements == NULL) {
13161 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13162 	}
13163 
13164 	/* Acquire the lock on the code directory */
13165 	pmap_cs_lock_code_directory(cd_entry);
13166 
13167 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13168 		ret = KERN_DENIED;
13169 		goto out;
13170 	} else if (cd_entry->kernel_entitlements != NULL) {
13171 		ret = KERN_DENIED;
13172 		goto out;
13173 	}
13174 	cd_entry->kernel_entitlements = kernel_entitlements;
13175 
13176 	/* Association was a success */
13177 	ret = KERN_SUCCESS;
13178 
13179 out:
13180 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13181 	return ret;
13182 }
13183 
/*
 * Kernel entry point for associating kernel entitlements with a code
 * signature; forwards the request into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13191 
/*
 * Resolve the kernel entitlements object previously associated with the
 * main-region code signature of a user-space pmap.
 *
 * On success, the resolved pointer is written out through
 * kernel_entitlements (when non-NULL), pinning the destination pages around
 * the write. Returns KERN_NOT_FOUND when the pmap is the kernel pmap, has no
 * main-region code signature, or the signature has no kernel entitlements.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Take a shared lock on the PMAP */
	pmap_lock(pmap, PMAP_LOCK_SHARED);

	if (pmap == kernel_pmap) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13252 
/*
 * Kernel entry point for resolving the kernel entitlements of a pmap;
 * forwards the request into the PPL.
 */
kern_return_t
pmap_resolve_kernel_entitlements(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	return pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
}
13260 
/*
 * Build the CoreEntitlements acceleration index for a code signature's
 * entitlements context. The index buffer is carved out of unused space
 * within the locked-down signature when possible; otherwise it is allocated
 * from the PPL blob allocator, or a whole PPL page as a last resort.
 *
 * Returns KERN_SUCCESS when the context is accelerated (or needs no
 * acceleration), KERN_DENIED for non-reconstituted signatures, KERN_ABORTED
 * when the required index would exceed a page, or an allocator error
 * (e.g. KERN_RESOURCE_SHORTAGE, on which the caller retries).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without an entitlements context, or when already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13407 
13408 kern_return_t
13409 pmap_accelerate_entitlements(
13410 	pmap_cs_code_directory_t *cd_entry)
13411 {
13412 	kern_return_t ret = KERN_DENIED;
13413 
13414 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
13415 	while (ret == KERN_RESOURCE_SHORTAGE) {
13416 		/* Allocate a page for the PPL */
13417 		pmap_alloc_page_for_ppl(0);
13418 
13419 		/* Try again */
13420 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
13421 	}
13422 
13423 	return ret;
13424 }
13425 
13426 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13427 
13428 MARK_AS_PMAP_TEXT bool
13429 pmap_lookup_in_loaded_trust_caches_internal(
13430 	const uint8_t cdhash[CS_CDHASH_LEN])
13431 {
13432 	kern_return_t kr = KERN_NOT_FOUND;
13433 
13434 #if PMAP_CS_PPL_MONITOR
13435 	/*
13436 	 * If we have the PPL monitor, then this function can only be called from
13437 	 * within the PPL. Calling it directly would've caused a panic, so we can
13438 	 * assume that we're in the PPL here.
13439 	 */
13440 	uint8_t cdhash_safe[CS_CDHASH_LEN];
13441 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13442 
13443 	kr = pmap_query_trust_cache_safe(
13444 		kTCQueryTypeLoadable,
13445 		cdhash_safe,
13446 		NULL);
13447 #else
13448 	kr = query_trust_cache(
13449 		kTCQueryTypeLoadable,
13450 		cdhash,
13451 		NULL);
13452 #endif
13453 
13454 	if (kr == KERN_SUCCESS) {
13455 		return true;
13456 	}
13457 	return false;
13458 }
13459 
/*
 * Check whether a CDHash is present in any loaded trust cache. Routed
 * through the PPL when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13470 
/*
 * Query the static trust cache for a CDHash.
 *
 * Returns 0 on a miss. On a hit, returns a packed word containing
 * TC_LOOKUP_FOUND in the result field plus the matching entry's hash type
 * and flags (flags truncated to 8 bits) in their respective fields.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here. Copy the CDHash into local storage
	 * before querying.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Pull the flags and hash type recorded for the matching entry */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
13511 
/*
 * Query the static trust cache for a CDHash. Routed through the PPL when
 * the monitor is enabled.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13521 
13522 #if PMAP_CS_INCLUDE_CODE_SIGNING
13523 
/* Lock protecting the compilation service CDHash against concurrent set/match */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash identifying the compilation service; all-zero until explicitly set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13526 
13527 MARK_AS_PMAP_TEXT void
13528 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13529 {
13530 
13531 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13532 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13533 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13534 
13535 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13536 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13537 }
13538 
13539 MARK_AS_PMAP_TEXT bool
13540 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13541 {
13542 	bool match = false;
13543 
13544 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13545 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13546 		match = true;
13547 	}
13548 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13549 
13550 	if (match) {
13551 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13552 	}
13553 
13554 	return match;
13555 }
13556 
/*
 * Set the compilation service CDHash. Routed through the PPL when the
 * monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13566 
/*
 * Match a CDHash against the compilation service CDHash. Routed through the
 * PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13576 
13577 /*
13578  * As part of supporting local signing on the device, we need the PMAP layer
13579  * to store the local signing key so that PMAP_CS can validate with it. We
13580  * store it at the PMAP layer such that it is accessible to both AMFI and
13581  * PMAP_CS should they need it.
13582  */
13583 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13584 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13585 
13586 MARK_AS_PMAP_TEXT void
13587 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13588 {
13589 	bool key_set = false;
13590 
13591 	/*
13592 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13593 	 * a successful exchange means that the local signing public key has _not_ been
13594 	 * set. In case the key has been set, we panic as we would never expect the
13595 	 * kernel to attempt to set the key more than once.
13596 	 */
13597 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13598 
13599 	if (key_set) {
13600 		panic("attempted to set the local signing public key multiple times");
13601 	}
13602 
13603 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13604 	pmap_cs_log_info("set local signing public key");
13605 }
13606 
/*
 * Set the local signing public key. Routed through the PPL when the monitor
 * is enabled.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13616 
13617 uint8_t*
13618 pmap_get_local_signing_public_key(void)
13619 {
13620 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13621 
13622 	if (key_set) {
13623 		return pmap_local_signing_public_key;
13624 	}
13625 
13626 	return NULL;
13627 }
13628 
13629 /*
13630  * Locally signed applications need to be explicitly authorized by an entitled application
13631  * before we allow them to run.
13632  */
/* CDHash of the locally-signed binary most recently authorized via pmap_unrestrict_local_signing(). */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects all reads and writes of pmap_local_signing_cdhash. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13635 
/*
 * Record the CDHash of a locally-signed binary that has been explicitly
 * authorized to run. pmap_local_signing_restricted() compares candidate
 * CDHashes against this stored value.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	/* Copy under the lock so readers never observe a partially-updated hash. */
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13648 
13649 void
13650 pmap_unrestrict_local_signing(
13651 	const uint8_t cdhash[CS_CDHASH_LEN])
13652 {
13653 #if XNU_MONITOR
13654 	return pmap_unrestrict_local_signing_ppl(cdhash);
13655 #else
13656 	return pmap_unrestrict_local_signing_internal(cdhash);
13657 #endif
13658 }
13659 
13660 #if PMAP_CS
/*
 * Re-restrict local signing by zeroing the stored CDHash, so subsequent
 * pmap_local_signing_restricted() checks will report a mismatch (unless a
 * new hash is installed).
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13668 
13669 MARK_AS_PMAP_TEXT static bool
13670 pmap_local_signing_restricted(
13671 	const uint8_t cdhash[CS_CDHASH_LEN])
13672 {
13673 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13674 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13675 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13676 
13677 	return ret != 0;
13678 }
13679 
13680 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13681 #endif
13682 
13683 MARK_AS_PMAP_TEXT void
13684 pmap_footprint_suspend_internal(
13685 	vm_map_t        map,
13686 	boolean_t       suspend)
13687 {
13688 #if DEVELOPMENT || DEBUG
13689 	if (suspend) {
13690 		current_thread()->pmap_footprint_suspended = TRUE;
13691 		map->pmap->footprint_was_suspended = TRUE;
13692 	} else {
13693 		current_thread()->pmap_footprint_suspended = FALSE;
13694 	}
13695 #else /* DEVELOPMENT || DEBUG */
13696 	(void) map;
13697 	(void) suspend;
13698 #endif /* DEVELOPMENT || DEBUG */
13699 }
13700 
/*
 * Public entry point for suspending/resuming footprint accounting; routes
 * through the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
13712 
/*
 * Deliberate no-op beyond validating the pmap pointer; exercises the
 * PPL call path without side effects.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
13718 
/* Public no-op entry point; dispatches to the PPL when the monitor is enabled. */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
13728 
13729 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
13730 
/* Header emitted before each raw table copied by pmap_dump_page_tables_recurse(). */
struct page_table_dump_header {
	uint64_t pa;          /* Physical address of this translation table. */
	uint64_t num_entries; /* Number of tt_entry_t entries copied after this header. */
	uint64_t start_va;    /* First VA translated by this table. */
	uint64_t end_va;      /* End (exclusive) of the VA range this table translates. */
};
13737 
/*
 * Recursively copy a translation table, and the children of its table-type
 * entries, into the caller-supplied dump buffer. Levels selected by
 * level_mask are emitted as a page_table_dump_header followed by the raw
 * table contents; *bytes_copied tracks the running output size.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE if the buffer cannot hold the next
 * table, KERN_SUCCESS otherwise. Panics on a table-type entry found at the
 * leaf level (corrupt page table).
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and entry-decoding masks. */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current write position within the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be allocated smaller than a full page. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Conservatively require room for this table even if level_mask skips it. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit a header describing this table, then the raw table itself. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mapping: no child table to descend into. */
			continue;
		} else {
			/* Table-type entries must not appear at the leaf level. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			/* Descend into the next-level table. */
			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
13811 
13812 kern_return_t
13813 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
13814 {
13815 	if (not_in_kdp) {
13816 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
13817 	}
13818 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
13819 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
13820 }
13821 
13822 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13823 
/* Stub: page-table dumping is only available on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
13830 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
13831 
13832 
13833 #ifdef CONFIG_XNUPOST
13834 #ifdef __arm64__
/* Set by pmap_test_fault_handler when an expected fault is taken; read back by pmap_test_access. */
static volatile bool pmap_test_took_fault = false;
13836 
13837 static bool
13838 pmap_test_fault_handler(arm_saved_state_t * state)
13839 {
13840 	bool retval                 = false;
13841 	uint32_t esr                = get_saved_state_esr(state);
13842 	esr_exception_class_t class = ESR_EC(esr);
13843 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
13844 
13845 	if ((class == ESR_EC_DABORT_EL1) &&
13846 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
13847 		pmap_test_took_fault = true;
13848 		/* return to the instruction immediately after the call to NX page */
13849 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
13850 		retval = true;
13851 	}
13852 
13853 	return retval;
13854 }
13855 
13856 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write access to va, optionally while switched to
 * a test pmap, and report whether the fault outcome matched expectations.
 *
 * Returns true iff (fault taken) == should_fault. The interrupt/preemption
 * disable, pmap switch, and PAN manipulation are order-critical.
 */
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler, then perform the access under test. */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
13908 
13909 static bool
13910 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
13911 {
13912 	bool retval = pmap_test_access(pmap, va, should_fault, false);
13913 
13914 	if (!retval) {
13915 		T_FAIL("%s: %s, "
13916 		    "pmap=%p, va=%p, should_fault=%u",
13917 		    __func__, should_fault ? "did not fault" : "faulted",
13918 		    pmap, (void*)va, (unsigned)should_fault);
13919 	}
13920 
13921 	return retval;
13922 }
13923 
13924 static bool
13925 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
13926 {
13927 	bool retval = pmap_test_access(pmap, va, should_fault, true);
13928 
13929 	if (!retval) {
13930 		T_FAIL("%s: %s, "
13931 		    "pmap=%p, va=%p, should_fault=%u",
13932 		    __func__, should_fault ? "did not fault" : "faulted",
13933 		    pmap, (void*)va, (unsigned)should_fault);
13934 	}
13935 
13936 	return retval;
13937 }
13938 
13939 static bool
13940 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
13941 {
13942 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
13943 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
13944 
13945 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
13946 
13947 	if (!retval) {
13948 		T_FAIL("%s: bits=%u, "
13949 		    "pa=%p, should_be_set=%u",
13950 		    __func__, bits,
13951 		    (void*)pa, should_be_set);
13952 	}
13953 
13954 	return retval;
13955 }
13956 
13957 static __attribute__((noinline)) bool
13958 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
13959 {
13960 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
13961 	return retval;
13962 }
13963 
13964 static int
13965 pmap_test_test_config(unsigned int flags)
13966 {
13967 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
13968 	unsigned int map_count = 0;
13969 	unsigned long page_ratio = 0;
13970 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
13971 
13972 	if (!pmap) {
13973 		panic("Failed to allocate pmap");
13974 	}
13975 
13976 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
13977 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
13978 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
13979 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
13980 
13981 	if (pmap_page_size <= native_page_size) {
13982 		page_ratio = native_page_size / pmap_page_size;
13983 	} else {
13984 		/*
13985 		 * We claim to support a page_ratio of less than 1, which is
13986 		 * not currently supported by the pmap layer; panic.
13987 		 */
13988 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
13989 		    "flags=%u",
13990 		    __func__, native_page_size, pmap_page_size,
13991 		    flags);
13992 	}
13993 
13994 	if (PAGE_RATIO > 1) {
13995 		/*
13996 		 * The kernel is deliberately pretending to have 16KB pages.
13997 		 * The pmap layer has code that supports this, so pretend the
13998 		 * page size is larger than it is.
13999 		 */
14000 		pmap_page_size = PAGE_SIZE;
14001 		native_page_size = PAGE_SIZE;
14002 	}
14003 
14004 	/*
14005 	 * Get two pages from the VM; one to be mapped wired, and one to be
14006 	 * mapped nonwired.
14007 	 */
14008 	vm_page_t unwired_vm_page = vm_page_grab();
14009 	vm_page_t wired_vm_page = vm_page_grab();
14010 
14011 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14012 		panic("Failed to grab VM pages");
14013 	}
14014 
14015 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14016 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14017 
14018 	pmap_paddr_t pa = ptoa(pn);
14019 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14020 
14021 	/*
14022 	 * We'll start mappings at the second twig TT.  This keeps us from only
14023 	 * using the first entry in each TT, which would trivially be address
14024 	 * 0; one of the things we will need to test is retrieving the VA for
14025 	 * a given PTE.
14026 	 */
14027 	vm_map_address_t va_base = pmap_twig_size;
14028 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14029 
14030 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14031 		/*
14032 		 * Not exactly a functional failure, but this test relies on
14033 		 * there being a spare PTE slot we can use to pin the TT.
14034 		 */
14035 		panic("Cannot pin translation table");
14036 	}
14037 
14038 	/*
14039 	 * Create the wired mapping; this will prevent the pmap layer from
14040 	 * reclaiming our test TTs, which would interfere with this test
14041 	 * ("interfere" -> "make it panic").
14042 	 */
14043 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14044 
14045 #if XNU_MONITOR
14046 	/*
14047 	 * If the PPL is enabled, make sure that the kernel cannot write
14048 	 * to PPL memory.
14049 	 */
14050 	if (!pmap_ppl_disable) {
14051 		T_LOG("Validate that kernel cannot write to PPL memory.");
14052 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14053 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14054 	}
14055 #endif
14056 
14057 	/*
14058 	 * Create read-only mappings of the nonwired page; if the pmap does
14059 	 * not use the same page size as the kernel, create multiple mappings
14060 	 * so that the kernel page is fully mapped.
14061 	 */
14062 	for (map_count = 0; map_count < page_ratio; map_count++) {
14063 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14064 	}
14065 
14066 	/* Validate that all the PTEs have the expected PA and VA. */
14067 	for (map_count = 0; map_count < page_ratio; map_count++) {
14068 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14069 
14070 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14071 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14072 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14073 		}
14074 
14075 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14076 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14077 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14078 		}
14079 	}
14080 
14081 	T_LOG("Validate that reads to our mapping do not fault.");
14082 	pmap_test_read(pmap, va_base, false);
14083 
14084 	T_LOG("Validate that writes to our mapping fault.");
14085 	pmap_test_write(pmap, va_base, true);
14086 
14087 	T_LOG("Make the first mapping writable.");
14088 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14089 
14090 	T_LOG("Validate that writes to our mapping do not fault.");
14091 	pmap_test_write(pmap, va_base, false);
14092 
14093 
14094 	T_LOG("Make the first mapping execute-only");
14095 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14096 
14097 
14098 	T_LOG("Validate that reads to our mapping do not fault.");
14099 	pmap_test_read(pmap, va_base, false);
14100 
14101 	T_LOG("Validate that writes to our mapping fault.");
14102 	pmap_test_write(pmap, va_base, true);
14103 
14104 
14105 	/*
14106 	 * For page ratios of greater than 1: validate that writes to the other
14107 	 * mappings still fault.  Remove the mappings afterwards (we're done
14108 	 * with page ratio testing).
14109 	 */
14110 	for (map_count = 1; map_count < page_ratio; map_count++) {
14111 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14112 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14113 	}
14114 
14115 	T_LOG("Mark the page unreferenced and unmodified.");
14116 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14117 	pmap_test_check_refmod(pa, 0);
14118 
14119 	/*
14120 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14121 	 * different protection/fault_type settings, and confirm that the
14122 	 * ref/mod state matches our expectations at each step.
14123 	 */
14124 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14125 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14126 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14127 
14128 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14129 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14130 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14131 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14132 
14133 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14134 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14135 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14136 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14137 
14138 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14139 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14140 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14141 
14142 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14143 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14144 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14145 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14146 
14147 	/*
14148 	 * Shared memory testing; we'll have two mappings; one read-only,
14149 	 * one read-write.
14150 	 */
14151 	vm_map_address_t rw_base = va_base;
14152 	vm_map_address_t ro_base = va_base + pmap_page_size;
14153 
14154 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14155 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14156 
14157 	/*
14158 	 * Test that we take faults as expected for unreferenced/unmodified
14159 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14160 	 * mapping permissions change as expected.
14161 	 */
14162 	T_LOG("!ref/!mod: expect no access");
14163 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14164 	pmap_test_read_write(pmap, ro_base, false, false);
14165 	pmap_test_read_write(pmap, rw_base, false, false);
14166 
14167 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14168 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14169 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14170 	pmap_test_read_write(pmap, ro_base, true, false);
14171 	pmap_test_read_write(pmap, rw_base, true, false);
14172 
14173 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14174 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14175 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14176 	pmap_test_read_write(pmap, ro_base, true, false);
14177 	pmap_test_read_write(pmap, rw_base, true, true);
14178 
14179 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14180 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14181 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14182 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14183 	pmap_test_read_write(pmap, ro_base, true, false);
14184 	pmap_test_read_write(pmap, rw_base, true, true);
14185 
14186 	T_LOG("RW protect both mappings; should not change protections.");
14187 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14188 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14189 	pmap_test_read_write(pmap, ro_base, true, false);
14190 	pmap_test_read_write(pmap, rw_base, true, true);
14191 
14192 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14193 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14194 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14195 	pmap_test_read_write(pmap, ro_base, true, false);
14196 	pmap_test_read_write(pmap, rw_base, true, false);
14197 
14198 	T_LOG("RW protect the page; mappings should not change protections.");
14199 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14200 	pmap_page_protect(pn, VM_PROT_ALL);
14201 	pmap_test_read_write(pmap, ro_base, true, false);
14202 	pmap_test_read_write(pmap, rw_base, true, true);
14203 
14204 	T_LOG("Read protect the page; RW mapping should become RO.");
14205 	pmap_page_protect(pn, VM_PROT_READ);
14206 	pmap_test_read_write(pmap, ro_base, true, false);
14207 	pmap_test_read_write(pmap, rw_base, true, false);
14208 
14209 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14210 	pmap_disconnect(pn);
14211 	if (!pmap_verify_free(pn)) {
14212 		T_FAIL("Page still has mappings");
14213 	}
14214 
14215 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14216 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14217 	pmap_destroy(pmap);
14218 
14219 	T_LOG("Release the pages back to the VM.");
14220 	vm_page_lock_queues();
14221 	vm_page_free(unwired_vm_page);
14222 	vm_page_free(wired_vm_page);
14223 	vm_page_unlock_queues();
14224 
14225 	T_LOG("Testing successful!");
14226 	return 0;
14227 }
14228 #endif /* __arm64__ */
14229 
14230 kern_return_t
14231 pmap_test(void)
14232 {
14233 	T_LOG("Starting pmap_tests");
14234 #ifdef __arm64__
14235 	int flags = 0;
14236 	flags |= PMAP_CREATE_64BIT;
14237 
14238 #if __ARM_MIXED_PAGE_SIZE__
14239 	T_LOG("Testing VM_PAGE_SIZE_4KB");
14240 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14241 	T_LOG("Testing VM_PAGE_SIZE_16KB");
14242 	pmap_test_test_config(flags);
14243 #else /* __ARM_MIXED_PAGE_SIZE__ */
14244 	pmap_test_test_config(flags);
14245 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14246 
14247 #endif /* __arm64__ */
14248 	T_PASS("completed pmap_test successfully");
14249 	return KERN_SUCCESS;
14250 }
14251 #endif /* CONFIG_XNUPOST */
14252 
14253 /*
14254  * The following function should never make it to RELEASE code, since
14255  * it provides a way to get the PPL to modify text pages.
14256  */
14257 #if DEVELOPMENT || DEBUG
14258 
/* A32 (ARM) permanently-undefined instruction encoding, used to corrupt text. */
#define ARM_UNDEFINED_INSN 0xe7f000f0
/* T32 (Thumb) permanently-undefined instruction encoding. */
#define ARM_UNDEFINED_INSN_THUMB 0xde00
14261 
14262 /**
14263  * Forcibly overwrite executable text with an illegal instruction.
14264  *
14265  * @note Only used for xnu unit testing.
14266  *
14267  * @param pa The physical address to corrupt.
14268  *
14269  * @return KERN_SUCCESS on success.
14270  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
	/* Route through the PPL when the monitor is enabled; otherwise call directly. */
#if XNU_MONITOR
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14280 
/*
 * Overwrite the instruction at physical address pa with a permanently
 * undefined encoding (ARM or Thumb, chosen by the low address bit), making
 * the PTOV mapping temporarily writable if the page is executable, then
 * invalidate the icache for the modified region. Test-only.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the PV head lock so the page's mapping state cannot change under us. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages are mapped read-only in the PTOV region; make them writable first. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the read-only PTOV mapping for executable pages. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure the icache observes the overwritten instruction. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14323 
14324 #endif /* DEVELOPMENT || DEBUG */
14325