xref: /xnu-11215.41.3/osfmk/arm/pmap/pmap.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 /*
2  * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84 
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89 
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93 
94 #include <pexpert/device_tree.h>
95 
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98 
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102 
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106 
107 
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111 
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117 
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121 
/* Number of elements in a statically-sized array (do not use on pointers). */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
123 
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125 
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131 
/*
 * Native (stage-1) page table operations: ASID allocation/release,
 * asynchronous TLB maintenance, and WIMG-to-PTE attribute conversion.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140 
/*
 * Per-level translation table geometry for the 16KB granule, levels 0-3:
 * entry coverage size, VA offset mask, shift, index mask, and the
 * valid/type bit encodings used to classify each entry.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* L3 entries are leaf PTEs, hence the PTE (not TTE) valid/type masks. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180 
/*
 * Per-level translation table geometry for the 4KB granule, levels 0-3.
 * Same layout as pmap_table_level_info_16k, with 4K-granule constants.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* L3 entries are leaf PTEs, hence the PTE (not TTE) valid/type masks. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220 
/*
 * Per-level translation table geometry for 4KB-granule stage-2 tables.
 * Level 0 is unused; level 1 may use a wider (concatenated) index mask
 * when ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK is defined.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* L3 entries are leaf PTEs, hence the PTE (not TTE) valid/type masks. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
264 
/*
 * Page table attribute bundle for the 4KB granule: level geometry table,
 * root/commpage/max paging levels, access-permission and execute-never PTE
 * bit encodings, and page-size constants.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
292 
/*
 * Page table attribute bundle for the 16KB granule; see pmap_pt_attr_4k
 * for field meanings.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
312 
313 #if __ARM_16K_PG__
314 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
315 #else /* !__ARM_16K_PG__ */
316 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
317 #endif /* !__ARM_16K_PG__ */
318 
319 
320 #if DEVELOPMENT || DEBUG
321 int vm_footprint_suspend_allowed = 1;
322 
323 extern int pmap_ledgers_panic;
324 extern int pmap_ledgers_panic_leeway;
325 
326 #endif /* DEVELOPMENT || DEBUG */
327 
328 #if DEVELOPMENT || DEBUG
329 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
330 	(current_thread()->pmap_footprint_suspended)
331 #else /* DEVELOPMENT || DEBUG */
332 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
333 #endif /* DEVELOPMENT || DEBUG */
334 
335 
336 /*
337  * Represents a tlb range that will be flushed before exiting
338  * the ppl.
339  * Used by phys_attribute_clear_range to defer flushing pages in
340  * this range until the end of the operation.
341  */
342 typedef struct pmap_tlb_flush_range {
343 	pmap_t ptfr_pmap;
344 	vm_map_address_t ptfr_start;
345 	vm_map_address_t ptfr_end;
346 	bool ptfr_flush_needed;
347 } pmap_tlb_flush_range_t;
348 
349 #if XNU_MONITOR
350 /*
351  * PPL External References.
352  */
353 extern vm_offset_t   segPPLDATAB;
354 extern unsigned long segSizePPLDATA;
355 extern vm_offset_t   segPPLTEXTB;
356 extern unsigned long segSizePPLTEXT;
357 extern vm_offset_t   segPPLDATACONSTB;
358 extern unsigned long segSizePPLDATACONST;
359 
360 
361 /*
362  * PPL Global Variables
363  */
364 
365 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
366 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
367 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
368 #else
369 const boolean_t pmap_ppl_disable = FALSE;
370 #endif
371 
372 /*
373  * Indicates if the PPL has started applying APRR.
374  * This variable is accessed from various assembly trampolines, so be sure to change
375  * those if you change the size or layout of this variable.
376  */
377 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
378 
379 extern void *pmap_stacks_start;
380 extern void *pmap_stacks_end;
381 
#endif /* XNU_MONITOR */
383 
384 
385 
386 /* Virtual memory region for early allocation */
387 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
388 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
389 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
390 
391 extern uint8_t bootstrap_pagetables[];
392 
393 extern unsigned int not_in_kdp;
394 
395 extern vm_offset_t first_avail;
396 
397 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
398 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
399 extern vm_offset_t     static_memory_end;
400 
401 extern const vm_map_address_t physmap_base;
402 extern const vm_map_address_t physmap_end;
403 
404 extern int maxproc, hard_maxproc;
405 
406 /* The number of address bits one TTBR can cover. */
407 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
408 
409 /*
410  * The bounds on our TTBRs.  These are for sanity checking that
411  * an address is accessible by a TTBR before we attempt to map it.
412  */
413 
414 /* The level of the root of a page table. */
415 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
416 
417 /* The number of entries in the root TT of a page table. */
418 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
419 
420 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
421 const pmap_t    kernel_pmap = &kernel_pmap_store;
422 
423 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
424 
425 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
426 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
427 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
428 
429 typedef struct tt_free_entry {
430 	struct tt_free_entry    *next;
431 } tt_free_entry_t;
432 
433 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
434 
435 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
436 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
437 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
438 #define FREE_PAGE_SIZE_TT_MAX   4
439 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
440 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
441 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
442 
443 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
444 
445 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
446 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
447 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
448 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
449 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
450 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
451 
452 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
453 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
454 
455 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
456 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
457 
458 /* Lock group used for all pmap object locks. */
459 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
460 
461 #if DEVELOPMENT || DEBUG
462 int nx_enabled = 1;                                     /* enable no-execute protection */
463 int allow_data_exec  = 0;                               /* No apps may execute data */
464 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
465 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
466 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
467 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
468 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
469 #else /* DEVELOPMENT || DEBUG */
470 const int nx_enabled = 1;                                       /* enable no-execute protection */
471 const int allow_data_exec  = 0;                         /* No apps may execute data */
472 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
473 #endif /* DEVELOPMENT || DEBUG */
474 
475 /**
476  * This variable is set true during hibernation entry to protect pmap data structures
477  * during image copying, and reset false on hibernation exit.
478  */
479 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
480 
481 #if MACH_ASSERT
482 static void pmap_check_ledgers(pmap_t pmap);
483 #else
/* Ledger validation is only performed on MACH_ASSERT kernels; no-op otherwise. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
488 #endif /* MACH_ASSERT */
489 
490 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
491 
492 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
493 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
494 
495 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
496 
497 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
498 #if defined(__arm64__)
499 /* end of shared region + 512MB for various purposes */
500 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
501 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
502     "Minimum address space size outside allowable range");
503 
504 // Max offset is 15.375GB for devices with "large" memory config
505 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
506 // Max offset is 11.375GB for devices with "small" memory config
507 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
508 
509 
510 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
511     "Large device address space size outside allowable range");
512 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
513     "Small device address space size outside allowable range");
514 
515 #  ifdef XNU_TARGET_OS_OSX
516 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
517 #  else
518 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
519 #  endif
520 #endif /* __arm64__ */
521 
522 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
523 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
524 #else
525 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
526 #endif
527 
528 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
529 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
530 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
531 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
532 #if !HAS_16BIT_ASID
533 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
534 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
535 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
536 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
537 #else
538 static uint16_t last_allocated_asid = 0;
539 #endif /* !HAS_16BIT_ASID */
540 
541 #if HAS_SPECRES_DEBUGGING
542 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
543 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
544 #endif /* HAS_SPECRES_DEBUGGING */
545 
546 
547 #if __ARM_MIXED_PAGE_SIZE__
548 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
549 #endif
550 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
551 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
552 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
553 
554 /* PTE Define Macros */
555 
/*
 * A PTE represents a compressed page when it is invalid (low two bits clear)
 * and carries the ARM_PTE_COMPRESSED marker; any bit outside
 * ARM_PTE_COMPRESSED_MASK indicates corruption and panics.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */       \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
562 
/* True when the PTE's software wired bits are set. */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True when the software ARM_PTE_WRITEABLE bit is set on the PTE. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software ARM_PTE_WRITEABLE bit on a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
577 
578 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)579 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
580 {
581 	if (wired) {
582 		*ptep |= ARM_PTE_WIRED;
583 	} else {
584 		*ptep &= ~ARM_PTE_WIRED;
585 	}
586 	/*
587 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
588 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
589 	 * never reclaimed.
590 	 */
591 	if (pmap == kernel_pmap) {
592 		return;
593 	}
594 	unsigned short *ptd_wiredcnt_ptr;
595 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
596 	if (wired) {
597 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
598 	} else {
599 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
600 		if (__improbable(prev_wired == 0)) {
601 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
602 		}
603 	}
604 }
605 
606 #if HAS_FEAT_XS
607 
608 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)609 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
610 {
611 	if (__improbable(pt_attr->stage2)) {
612 		return false;
613 	}
614 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
615 	case CACHE_ATTRINDX_DISABLE_XS:
616 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
617 		return true;
618 	default:
619 		return false;
620 	}
621 }
622 
623 #endif /* HAS_FEAT_XS */
624 
/*
 * Flush the TLB entries covering [s, e) for the given pmap, then synchronize
 * with arm64_sync_tlb().  Wrapped in do/while(0) so the macro expands to a
 * single statement and remains safe in unbraced if/else bodies (CERT PRE10-C).
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only)                                                                     \
	do {                                                                                                                      \
	        pmap_get_pt_ops(pmap)->flush_tlb_region_async((s), (size_t)((e) - (s)), (pmap), (last_level_only), (strong));     \
	        arm64_sync_tlb((strong));                                                                                         \
	} while (0)
629 
630 /*
631  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
632  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
633  * will observe the updated PTE.
634  */
635 #define FLUSH_PTE()                                                                     \
636 	__builtin_arm_dmb(DMB_ISH);
637 
638 /*
639  * Synchronize updates to PTEs that were previously valid and thus may be cached in
640  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
641  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
642  * program order will not issue until the DSB completes.  Prior loads may be reordered
643  * after the barrier, but their behavior should not be materially affected by the
644  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
645  * matter for loads until the access is re-driven well after the TLB update is
646  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
647  * we should be in a position to handle access faults.  For "voluntary" PTE access
648  * restriction due to unmapping or protection, the decision to restrict access should
649  * have a data dependency on prior loads in order to avoid a data race.
650  */
651 #define FLUSH_PTE_STRONG()                                                             \
652 	__builtin_arm_dsb(DSB_ISHST);
653 
654 /**
655  * Write enough page table entries to map a single VM page. On systems where the
656  * VM page size does not match the hardware page size, multiple page table
657  * entries will need to be written.
658  *
659  * @note This function does not emit a barrier to ensure these page table writes
660  *       have completed before continuing. This is commonly needed. In the case
661  *       where a DMB or DSB barrier is needed, then use the write_pte() and
662  *       write_pte_strong() functions respectively instead of this one.
663  *
664  * @param ptep Pointer to the first page table entry to update.
665  * @param pte The value to write into each page table entry. In the case that
666  *            multiple PTEs are updated to a non-empty value, then the address
667  *            in this value will automatically be incremented for each PTE
668  *            write.
669  */
670 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)671 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
672 {
673 	/**
674 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
675 	 * systems, which is why it's checked at runtime instead of compile time.
676 	 * The "unreachable" warning needs to be suppressed because it still is a
677 	 * compile time constant on some systems.
678 	 */
679 	__unreachable_ok_push
680 	if (TEST_PAGE_RATIO_4) {
681 		if (((uintptr_t)ptep) & 0x1f) {
682 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
683 			    __func__, ptep, (void*)pte);
684 		}
685 
686 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
687 			/**
688 			 * If we're writing an empty/compressed PTE value, then don't
689 			 * auto-increment the address for each PTE write.
690 			 */
691 			*ptep = pte;
692 			*(ptep + 1) = pte;
693 			*(ptep + 2) = pte;
694 			*(ptep + 3) = pte;
695 		} else {
696 			*ptep = pte;
697 			*(ptep + 1) = pte | 0x1000;
698 			*(ptep + 2) = pte | 0x2000;
699 			*(ptep + 3) = pte | 0x3000;
700 		}
701 	} else {
702 		*ptep = pte;
703 	}
704 	__unreachable_ok_pop
705 }
706 
707 /**
708  * Writes enough page table entries to map a single VM page and then ensures
709  * those writes complete by executing a Data Memory Barrier.
710  *
711  * @note The DMB issued by this function is not strong enough to protect against
712  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
713  *       instruction is going to immediately be called after this write, it's
714  *       recommended to call write_pte_strong() instead of this function.
715  *
716  * See the function header for write_pte_fast() for more details on the
717  * parameters.
718  */
719 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)720 write_pte(pt_entry_t *ptep, pt_entry_t pte)
721 {
722 	write_pte_fast(ptep, pte);
723 	FLUSH_PTE();
724 }
725 
726 /**
727  * Writes enough page table entries to map a single VM page and then ensures
728  * those writes complete by executing a Data Synchronization Barrier. This
729  * barrier provides stronger guarantees than the DMB executed by write_pte().
730  *
731  * @note This function is useful if you're going to immediately flush the TLB
732  *       after making the PTE write. A DSB is required to protect against the
733  *       TLB invalidate being reordered before the PTE write.
734  *
735  * See the function header for write_pte_fast() for more details on the
736  * parameters.
737  */
738 static void
write_pte_strong(pt_entry_t * ptep,pt_entry_t pte)739 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
740 {
741 	write_pte_fast(ptep, pte);
742 	FLUSH_PTE_STRONG();
743 }
744 
745 /**
746  * Retrieve the pmap structure for the thread running on the current CPU.
747  */
748 pmap_t
current_pmap()749 current_pmap()
750 {
751 	const pmap_t current = vm_map_pmap(current_thread()->map);
752 
753 	assert(current != NULL);
754 
755 #if XNU_MONITOR
756 	/**
757 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
758 	 * decided by kernel-writable memory. This function is used in various parts
759 	 * of the PPL, and besides validating that the pointer returned by this
760 	 * function is indeed a pmap structure, it's also important to ensure that
761 	 * it's actually the current thread's pmap. This is because different pmaps
762 	 * will have access to different entitlements based on the code signature of
763 	 * their loaded process. So if a different user pmap is set in the current
764 	 * thread structure (in an effort to bypass code signing restrictions), even
765 	 * though the structure would validate correctly as it is a real pmap
766 	 * structure, it should fail here.
767 	 *
768 	 * This only needs to occur for user pmaps because the kernel pmap's root
769 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
770 	 * changed so it'd be redundant to check), and its code signing fields are
771 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
772 	 * it shouldn't be possible to set those fields. Due to that, an attacker
773 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
774 	 * this check won't accomplish anything as it doesn't provide any extra code
775 	 * signing entitlements.
776 	 */
777 	if ((current != kernel_pmap) &&
778 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
779 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
780 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
781 	}
782 #endif /* XNU_MONITOR */
783 
784 	return current;
785 }
786 
787 #if DEVELOPMENT || DEBUG
788 
789 /*
790  * Trace levels are controlled by a bitmask in which each
791  * level can be enabled/disabled by the (1<<level) position
792  * in the boot arg
793  * Level 0: PPL extension functionality
794  * Level 1: pmap lifecycle (create/destroy/switch)
795  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
796  * Level 3: internal state management (attributes/fast-fault)
797  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
798  */
799 
800 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
801 
/*
 * Emit a pmap kdebug event when the given trace level is enabled in
 * pmap_trace_mask.  Wrapped in do/while(0) so the macro is a single
 * statement, avoiding dangling-else hazards at call sites (CERT PRE10-C).
 * Call sites terminate with a semicolon, matching the empty non-debug variant.
 */
#define PMAP_TRACE(level, ...)                                        \
	do {                                                          \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__);                    \
	        }                                                     \
	} while (0)
806 #else /* DEVELOPMENT || DEBUG */
807 
808 #define PMAP_TRACE(level, ...)
809 
810 #endif /* DEVELOPMENT || DEBUG */
811 
812 
813 /*
814  * Internal function prototypes (forward declarations).
815  */
816 
817 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
818 
819 static void pmap_set_reference(ppnum_t pn);
820 
821 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
822 
823 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
824 
825 static kern_return_t pmap_expand(
826 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
827 
828 static int pmap_remove_range(
829 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
830 
831 static tt_entry_t *pmap_tt1_allocate(
832 	pmap_t, vm_size_t, unsigned int);
833 
834 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
835 
836 static void pmap_tt1_deallocate(
837 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
838 
839 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
840 
841 static kern_return_t pmap_tt_allocate(
842 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
843 
844 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
845 
846 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
847 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
848 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
849 
850 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
851 
852 
853 static void pmap_unmap_commpage(
854 	pmap_t pmap);
855 
856 static boolean_t
857 pmap_is_64bit(pmap_t);
858 
859 
860 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
861 
862 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
863 
864 static bool pmap_update_cache_attributes_locked(
865 	ppnum_t, unsigned, bool);
866 
867 static boolean_t arm_clear_fast_fault(
868 	ppnum_t ppnum,
869 	vm_prot_t fault_type,
870 	pt_entry_t *pte_p);
871 
872 static void pmap_trim_self(pmap_t pmap);
873 static void pmap_trim_subord(pmap_t subord);
874 
875 
876 /*
877  * Temporary prototypes, while we wait for pmap_enter to move to taking an
878  * address instead of a page number.
879  */
880 static kern_return_t
881 pmap_enter_addr(
882 	pmap_t pmap,
883 	vm_map_address_t v,
884 	pmap_paddr_t pa,
885 	vm_prot_t prot,
886 	vm_prot_t fault_type,
887 	unsigned int flags,
888 	boolean_t wired);
889 
890 kern_return_t
891 pmap_enter_options_addr(
892 	pmap_t pmap,
893 	vm_map_address_t v,
894 	pmap_paddr_t pa,
895 	vm_prot_t prot,
896 	vm_prot_t fault_type,
897 	unsigned int flags,
898 	boolean_t wired,
899 	unsigned int options,
900 	__unused void   *arg,
901 	__unused pmap_mapping_type_t mapping_type);
902 
903 #ifdef CONFIG_XNUPOST
904 kern_return_t pmap_test(void);
905 #endif /* CONFIG_XNUPOST */
906 
907 PMAP_SUPPORT_PROTOTYPES(
908 	kern_return_t,
909 	arm_fast_fault, (pmap_t pmap,
910 	vm_map_address_t va,
911 	vm_prot_t fault_type,
912 	bool was_af_fault,
913 	bool from_user), ARM_FAST_FAULT_INDEX);
914 
915 PMAP_SUPPORT_PROTOTYPES(
916 	boolean_t,
917 	arm_force_fast_fault, (ppnum_t ppnum,
918 	vm_prot_t allow_mode,
919 	int options), ARM_FORCE_FAST_FAULT_INDEX);
920 
921 MARK_AS_PMAP_TEXT static boolean_t
922 arm_force_fast_fault_with_flush_range(
923 	ppnum_t ppnum,
924 	vm_prot_t allow_mode,
925 	int options,
926 	pmap_tlb_flush_range_t *flush_range);
927 
928 /**
929  * Definition of the states driving the batch cache attributes update
930  * state machine.
931  */
932 typedef struct {
933 	uint64_t page_index : 32,           /* The page index to be operated on */
934 	    state : 8,                      /* The current state of the update machine */
935 	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
936 	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
937 	:0;
938 } batch_set_cache_attr_state_t;
939 
940 /* Possible values of the "state" field. */
941 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
942 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
943 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
944 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
945 
946 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
947 
948 PMAP_SUPPORT_PROTOTYPES(
949 	batch_set_cache_attr_state_t,
950 	pmap_batch_set_cache_attributes, (
951 #if XNU_MONITOR
952 		volatile upl_page_info_t *user_page_list,
953 #else /* !XNU_MONITOR */
954 		upl_page_info_array_t user_page_list,
955 #endif /* XNU_MONITOR */
956 		batch_set_cache_attr_state_t state,
957 		unsigned int page_cnt,
958 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
959 
960 PMAP_SUPPORT_PROTOTYPES(
961 	kern_return_t,
962 	pmap_change_wiring, (pmap_t pmap,
963 	vm_map_address_t v,
964 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
965 
966 PMAP_SUPPORT_PROTOTYPES(
967 	pmap_t,
968 	pmap_create_options, (ledger_t ledger,
969 	vm_map_size_t size,
970 	unsigned int flags,
971 	kern_return_t * kr), PMAP_CREATE_INDEX);
972 
973 PMAP_SUPPORT_PROTOTYPES(
974 	void,
975 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
976 
977 PMAP_SUPPORT_PROTOTYPES(
978 	kern_return_t,
979 	pmap_enter_options, (pmap_t pmap,
980 	vm_map_address_t v,
981 	pmap_paddr_t pa,
982 	vm_prot_t prot,
983 	vm_prot_t fault_type,
984 	unsigned int flags,
985 	boolean_t wired,
986 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
987 
988 PMAP_SUPPORT_PROTOTYPES(
989 	pmap_paddr_t,
990 	pmap_find_pa, (pmap_t pmap,
991 	addr64_t va), PMAP_FIND_PA_INDEX);
992 
993 PMAP_SUPPORT_PROTOTYPES(
994 	kern_return_t,
995 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
996 
997 
998 PMAP_SUPPORT_PROTOTYPES(
999 	boolean_t,
1000 	pmap_is_empty, (pmap_t pmap,
1001 	vm_map_offset_t va_start,
1002 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1003 
1004 
1005 PMAP_SUPPORT_PROTOTYPES(
1006 	unsigned int,
1007 	pmap_map_cpu_windows_copy, (ppnum_t pn,
1008 	vm_prot_t prot,
1009 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1010 
1011 PMAP_SUPPORT_PROTOTYPES(
1012 	void,
1013 	pmap_ro_zone_memcpy, (zone_id_t zid,
1014 	vm_offset_t va,
1015 	vm_offset_t offset,
1016 	const vm_offset_t new_data,
1017 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1018 
1019 PMAP_SUPPORT_PROTOTYPES(
1020 	uint64_t,
1021 	pmap_ro_zone_atomic_op, (zone_id_t zid,
1022 	vm_offset_t va,
1023 	vm_offset_t offset,
1024 	zro_atomic_op_t op,
1025 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1026 
1027 PMAP_SUPPORT_PROTOTYPES(
1028 	void,
1029 	pmap_ro_zone_bzero, (zone_id_t zid,
1030 	vm_offset_t va,
1031 	vm_offset_t offset,
1032 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1033 
1034 PMAP_SUPPORT_PROTOTYPES(
1035 	vm_map_offset_t,
1036 	pmap_nest, (pmap_t grand,
1037 	pmap_t subord,
1038 	addr64_t vstart,
1039 	uint64_t size,
1040 	vm_map_offset_t vrestart,
1041 	kern_return_t * krp), PMAP_NEST_INDEX);
1042 
1043 PMAP_SUPPORT_PROTOTYPES(
1044 	void,
1045 	pmap_page_protect_options, (ppnum_t ppnum,
1046 	vm_prot_t prot,
1047 	unsigned int options,
1048 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1049 
1050 PMAP_SUPPORT_PROTOTYPES(
1051 	vm_map_address_t,
1052 	pmap_protect_options, (pmap_t pmap,
1053 	vm_map_address_t start,
1054 	vm_map_address_t end,
1055 	vm_prot_t prot,
1056 	unsigned int options,
1057 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1058 
1059 PMAP_SUPPORT_PROTOTYPES(
1060 	kern_return_t,
1061 	pmap_query_page_info, (pmap_t pmap,
1062 	vm_map_offset_t va,
1063 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1064 
1065 PMAP_SUPPORT_PROTOTYPES(
1066 	mach_vm_size_t,
1067 	pmap_query_resident, (pmap_t pmap,
1068 	vm_map_address_t start,
1069 	vm_map_address_t end,
1070 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1071 
1072 PMAP_SUPPORT_PROTOTYPES(
1073 	void,
1074 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1075 
1076 PMAP_SUPPORT_PROTOTYPES(
1077 	vm_map_address_t,
1078 	pmap_remove_options, (pmap_t pmap,
1079 	vm_map_address_t start,
1080 	vm_map_address_t end,
1081 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1082 
1083 
1084 PMAP_SUPPORT_PROTOTYPES(
1085 	void,
1086 	pmap_set_cache_attributes, (ppnum_t pn,
1087 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1088 
1089 PMAP_SUPPORT_PROTOTYPES(
1090 	void,
1091 	pmap_update_compressor_page, (ppnum_t pn,
1092 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1093 
1094 PMAP_SUPPORT_PROTOTYPES(
1095 	void,
1096 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1097 
1098 #if MACH_ASSERT || XNU_MONITOR
1099 PMAP_SUPPORT_PROTOTYPES(
1100 	void,
1101 	pmap_set_process, (pmap_t pmap,
1102 	int pid,
1103 	char *procname), PMAP_SET_PROCESS_INDEX);
1104 #endif
1105 
1106 PMAP_SUPPORT_PROTOTYPES(
1107 	void,
1108 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1109 
1110 PMAP_SUPPORT_PROTOTYPES(
1111 	vm_map_offset_t,
1112 	pmap_unnest_options, (pmap_t grand,
1113 	addr64_t vaddr,
1114 	uint64_t size,
1115 	vm_map_offset_t vrestart,
1116 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1117 
1118 PMAP_SUPPORT_PROTOTYPES(
1119 	void,
1120 	phys_attribute_set, (ppnum_t pn,
1121 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1122 
1123 PMAP_SUPPORT_PROTOTYPES(
1124 	void,
1125 	phys_attribute_clear, (ppnum_t pn,
1126 	unsigned int bits,
1127 	int options,
1128 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1129 
1130 #if __ARM_RANGE_TLBI__
1131 PMAP_SUPPORT_PROTOTYPES(
1132 	vm_map_address_t,
1133 	phys_attribute_clear_range, (pmap_t pmap,
1134 	vm_map_address_t start,
1135 	vm_map_address_t end,
1136 	unsigned int bits,
1137 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1138 #endif /* __ARM_RANGE_TLBI__ */
1139 
1140 
1141 PMAP_SUPPORT_PROTOTYPES(
1142 	void,
1143 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1144 
1145 PMAP_SUPPORT_PROTOTYPES(
1146 	void,
1147 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1148 
1149 PMAP_SUPPORT_PROTOTYPES(
1150 	void,
1151 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1152 
1153 PMAP_SUPPORT_PROTOTYPES(
1154 	void,
1155 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1156 
1157 PMAP_SUPPORT_PROTOTYPES(
1158 	void,
1159 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1160 
1161 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1162 PMAP_SUPPORT_PROTOTYPES(
1163 	void,
1164 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1165 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1166 
/*
 * Definition of the states used by pmap_trim().
 *
 * NOTE(review): the state value is passed into and returned from pmap_trim()
 * (see the PMAP_SUPPORT_PROTOTYPES entry below), which suggests the trim
 * operation is performed as a resumable sequence of steps — confirm against
 * pmap_trim_internal.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1187 
1188 PMAP_SUPPORT_PROTOTYPES(
1189 	pmap_trim_state_t,
1190 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1191 
1192 #if HAS_APPLE_PAC
1193 PMAP_SUPPORT_PROTOTYPES(
1194 	void *,
1195 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1196 PMAP_SUPPORT_PROTOTYPES(
1197 	void *,
1198 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1199 #endif /* HAS_APPLE_PAC */
1200 
1201 
1202 
1203 
1204 PMAP_SUPPORT_PROTOTYPES(
1205 	kern_return_t,
1206 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1207 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1208 
1209 PMAP_SUPPORT_PROTOTYPES(
1210 	kern_return_t,
1211 	pmap_load_trust_cache_with_type, (TCType_t type,
1212 	const vm_address_t pmap_img4_payload,
1213 	const vm_size_t pmap_img4_payload_len,
1214 	const vm_address_t img4_manifest,
1215 	const vm_size_t img4_manifest_len,
1216 	const vm_address_t img4_aux_manifest,
1217 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1218 
1219 PMAP_SUPPORT_PROTOTYPES(
1220 	void,
1221 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1222 
1223 PMAP_SUPPORT_PROTOTYPES(
1224 	kern_return_t,
1225 	pmap_query_trust_cache, (TCQueryType_t query_type,
1226 	const uint8_t cdhash[kTCEntryHashSize],
1227 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1228 
1229 PMAP_SUPPORT_PROTOTYPES(
1230 	errno_t,
1231 	pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1232 	const void *input_data,
1233 	size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1234 
1235 #if PMAP_CS_INCLUDE_CODE_SIGNING
1236 
1237 PMAP_SUPPORT_PROTOTYPES(
1238 	kern_return_t,
1239 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1240 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1241 
1242 PMAP_SUPPORT_PROTOTYPES(
1243 	kern_return_t,
1244 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1245 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1246 
1247 PMAP_SUPPORT_PROTOTYPES(
1248 	kern_return_t,
1249 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1250 	pmap_cs_profile_t * profile_obj),
1251 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1252 
1253 PMAP_SUPPORT_PROTOTYPES(
1254 	kern_return_t,
1255 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1256 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1257 
1258 PMAP_SUPPORT_PROTOTYPES(
1259 	kern_return_t,
1260 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1261 	const void *kernel_entitlements),
1262 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1263 
1264 PMAP_SUPPORT_PROTOTYPES(
1265 	kern_return_t,
1266 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1267 	const void **kernel_entitlements),
1268 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1269 
1270 PMAP_SUPPORT_PROTOTYPES(
1271 	kern_return_t,
1272 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1273 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1274 
1275 PMAP_SUPPORT_PROTOTYPES(
1276 	kern_return_t,
1277 	pmap_cs_allow_invalid, (pmap_t pmap),
1278 	PMAP_CS_ALLOW_INVALID_INDEX);
1279 
1280 PMAP_SUPPORT_PROTOTYPES(
1281 	void,
1282 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1283 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1284 
1285 PMAP_SUPPORT_PROTOTYPES(
1286 	bool,
1287 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1288 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1289 
1290 PMAP_SUPPORT_PROTOTYPES(
1291 	void,
1292 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1293 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1294 
1295 PMAP_SUPPORT_PROTOTYPES(
1296 	void,
1297 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1298 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1299 
1300 #endif
1301 
1302 PMAP_SUPPORT_PROTOTYPES(
1303 	uint32_t,
1304 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1305 
1306 PMAP_SUPPORT_PROTOTYPES(
1307 	bool,
1308 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1309 
1310 PMAP_SUPPORT_PROTOTYPES(
1311 	void,
1312 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1313 
1314 void pmap_footprint_suspend(vm_map_t    map,
1315     boolean_t   suspend);
1316 PMAP_SUPPORT_PROTOTYPES(
1317 	void,
1318 	pmap_footprint_suspend, (vm_map_t map,
1319 	boolean_t suspend),
1320 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1321 
1322 
1323 
1324 
1325 
1326 #if DEVELOPMENT || DEBUG
1327 PMAP_SUPPORT_PROTOTYPES(
1328 	kern_return_t,
1329 	pmap_test_text_corruption, (pmap_paddr_t),
1330 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1331 #endif /* DEVELOPMENT || DEBUG */
1332 
1333 /*
1334  * The low global vector page is mapped at a fixed alias.
1335  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1336  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1337  * to check both addresses anyway for backward compatibility. So for now
1338  * we leave H6 and H7 where they were.
1339  */
1340 #if (ARM_PGSHIFT == 14)
1341 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1342 #else
1343 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1344 #endif
1345 
1346 
1347 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1348 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1349 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1350 
#if XNU_MONITOR

/*
 * When pointer authentication is available, each handler pointer in the table
 * below is stored signed (function-pointer key, address-diversified) so a
 * corrupted entry cannot be dispatched through.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX dispatch selectors (designated initializers);
 * any selector without an entry is NULL. The table itself is const.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */


};
#endif
1445 
1446 #if XNU_MONITOR
1447 /**
1448  * A convenience function for setting protections on a single physical
1449  * aperture or static region mapping without invalidating the TLB.
1450  *
1451  * @note This function does not perform any TLB invalidations. That must be done
1452  *       separately to be able to safely use the updated mapping.
1453  *
1454  * @note This function understands the difference between the VM page size and
1455  *       the kernel page size and will update multiple PTEs if the sizes differ.
1456  *       In other words, enough PTEs will always get updated to change the
1457  *       permissions on a PAGE_SIZE amount of memory.
1458  *
1459  * @note The PVH lock for the physical page represented by this mapping must
1460  *       already be locked.
1461  *
1462  * @note This function assumes the caller has already verified that the PTE
1463  *       pointer does indeed point to a physical aperture or static region page
1464  *       table. Please validate your inputs before passing it along to this
1465  *       function.
1466  *
1467  * @param ptep Pointer to the physical aperture or static region page table to
1468  *             update with a new XPRR index.
1469  * @param expected_perm The XPRR index that is expected to already exist at the
1470  *                      current mapping. If the current index doesn't match this
1471  *                      then the system will panic.
1472  * @param new_perm The new XPRR index to update the mapping with.
1473  */
1474 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1475 pmap_set_pte_xprr_perm(
1476 	pt_entry_t * const ptep,
1477 	unsigned int expected_perm,
1478 	unsigned int new_perm)
1479 {
1480 	assert(ptep != NULL);
1481 
1482 	pt_entry_t spte = *ptep;
1483 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1484 
1485 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1486 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1487 		    __func__, ptep, new_perm, expected_perm);
1488 	}
1489 
1490 	/**
1491 	 * The PTE involved should be valid, should not have the hint bit set, and
1492 	 * should have the expected XPRR index.
1493 	 */
1494 	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1495 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1496 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1497 		    __func__, ptep, spte, new_perm, expected_perm);
1498 	}
1499 
1500 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1501 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1502 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1503 		    __func__, ptep, spte, new_perm, expected_perm);
1504 	}
1505 
1506 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1507 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1508 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1509 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1510 	}
1511 
1512 	pt_entry_t template = spte;
1513 	template &= ~ARM_PTE_XPRR_MASK;
1514 	template |= xprr_perm_to_pte(new_perm);
1515 
1516 	write_pte_strong(ptep, template);
1517 }
1518 
1519 /**
1520  * Update the protections on a single physical aperture mapping and invalidate
1521  * the TLB so the mapping can be used.
1522  *
1523  * @note The PVH lock for the physical page must already be locked.
1524  *
1525  * @param pai The physical address index of the page whose physical aperture
1526  *            mapping will be updated with new permissions.
1527  * @param expected_perm The XPRR index that is expected to already exist at the
1528  *                      current mapping. If the current index doesn't match this
1529  *                      then the system will panic.
1530  * @param new_perm The new XPRR index to update the mapping with.
1531  */
1532 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1533 pmap_set_xprr_perm(
1534 	unsigned int pai,
1535 	unsigned int expected_perm,
1536 	unsigned int new_perm)
1537 {
1538 	pvh_assert_locked(pai);
1539 
1540 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1541 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1542 
1543 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1544 
1545 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1546 	sync_tlb_flush();
1547 }
1548 
1549 /**
1550  * Update the protections on a range of physical aperture or static region
1551  * mappings and invalidate the TLB so the mappings can be used.
1552  *
1553  * @note Static region mappings can only be updated before machine_lockdown().
1554  *       Physical aperture mappings can be updated at any time.
1555  *
1556  * @param start The starting virtual address of the static region or physical
1557  *              aperture range whose permissions will be updated.
1558  * @param end The final (inclusive) virtual address of the static region or
1559  *            physical aperture range whose permissions will be updated.
1560  * @param expected_perm The XPRR index that is expected to already exist at the
1561  *                      current mappings. If the current indices don't match
1562  *                      this then the system will panic.
1563  * @param new_perm The new XPRR index to update the mappings with.
1564  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within at least one of the two legal regions. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 *
		 * Adding one twig-size and masking off the twig offset rounds "va"
		 * up to the start of the next twig region.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (possibly partial) twig to the requested end. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Only a table-type twig entry can be walked down to leaf PTEs. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			/* pmap_set_pte_xprr_perm() requires the page's PVH lock to be held. */
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One batched TLB invalidation for the whole range, after all PTE updates. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1660 
1661 #endif /* XNU_MONITOR */
1662 
1663 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1664 PMAP_ZINFO_PALLOC(
1665 	pmap_t pmap, int bytes)
1666 {
1667 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1668 }
1669 
1670 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1671 PMAP_ZINFO_PFREE(
1672 	pmap_t pmap,
1673 	int bytes)
1674 {
1675 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1676 }
1677 
1678 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1679 pmap_tt_ledger_credit(
1680 	pmap_t          pmap,
1681 	vm_size_t       size)
1682 {
1683 	if (pmap != kernel_pmap) {
1684 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1685 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1686 	}
1687 }
1688 
1689 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1690 pmap_tt_ledger_debit(
1691 	pmap_t          pmap,
1692 	vm_size_t       size)
1693 {
1694 	if (pmap != kernel_pmap) {
1695 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1696 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1697 	}
1698 }
1699 
/**
 * Mark a hardware ASID as recently used in the pseudo-LRU tracking state.
 *
 * @param asid_index The (0-based) hardware ASID that was just handed out.
 *
 * No-op on targets with 16-bit ASIDs, which do not use the pLRU allocator.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit word of the pLRU bitmap tracks 64 hardware ASIDs. */
		unsigned plru_index = asid_index >> 6;
		/*
		 * Clear this ASID's bit. If the whole word just became 0, every ASID
		 * in it has now been used: bump the word's generation stamp and
		 * refill it so those ASIDs become eligible again.
		 */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/*
			 * The final word keeps its top bit clear, mirroring the bitmap's
			 * initialization in pmap_bootstrap() (that position lies past
			 * the usable hardware ASID range).
			 */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1713 
/**
 * Allocate a virtual ASID (VASID) for a pmap and derive from it the hardware
 * ASID and software ASID epoch stored in the pmap.
 *
 * @param pmap The pmap to receive the new ASID; on success its hw_asid and
 *             sw_asid fields are updated.
 *
 * @return true on success, false if the VASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/*
		 * Pseudo-LRU path: pick the 64-ASID word with the oldest generation
		 * stamp, then look for a free VASID whose hardware ASID falls within
		 * that word.
		 */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Step through the VASID bitmap one ASID-chunk at a time so that each
		 * probed word corresponds to the same hardware-ASID range selected above.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No free VASID anywhere: give up. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* In asid_bitmap a set bit means "free"; clear it to claim this VASID. */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the VASID into a hardware ASID and a software chunk index. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1798 
/**
 * Release a pmap's ASID back to the allocator, reversing the transformations
 * applied by alloc_asid().
 *
 * @param pmap The pmap whose ASID is being released; its hw_asid field is
 *             atomically cleared.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically take ownership of the field; 0 means "no ASID assigned". */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	/* Undo the EL0/EL1 ASID-pair doubling applied by alloc_asid(). */
	hw_asid >>= 1;
#endif
	/* Undo the +1 bias that reserves hardware ASID 0 for the kernel. */
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/*
		 * Reserved-VASID case: alloc_asid() allocated from the high end of the
		 * hardware ASID range; invert that mapping to recover the VASID.
		 */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID as available again in the pseudo-LRU bitmap. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	/* The VASID must currently be marked allocated (clear); set it free again. */
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1831 
1832 
1833 boolean_t
pmap_valid_address(pmap_paddr_t addr)1834 pmap_valid_address(
1835 	pmap_paddr_t addr)
1836 {
1837 	return pa_valid(addr);
1838 }
1839 
1840 
1841 
1842 
1843 
1844 
1845 /*
1846  *      Map memory at initialization.  The physical addresses being
1847  *      mapped are not managed and are never unmapped.
1848  *
1849  *      For now, VM is already on, we only need to map the
1850  *      specified memory.
1851  */
1852 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1853 pmap_map(
1854 	vm_map_address_t virt,
1855 	vm_offset_t start,
1856 	vm_offset_t end,
1857 	vm_prot_t prot,
1858 	unsigned int flags)
1859 {
1860 	kern_return_t   kr;
1861 	vm_size_t       ps;
1862 
1863 	ps = PAGE_SIZE;
1864 	while (start < end) {
1865 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1866 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1867 
1868 		if (kr != KERN_SUCCESS) {
1869 			panic("%s: failed pmap_enter, "
1870 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1871 			    __FUNCTION__,
1872 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1873 		}
1874 
1875 		virt += ps;
1876 		start += ps;
1877 	}
1878 	return virt;
1879 }
1880 
1881 #if XNU_MONITOR
1882 /**
1883  * Remove kernel writeablity from an IO PTE value if the page is owned by
1884  * guarded mode software.
1885  *
1886  * @param paddr The physical address of the page which has to be non-DRAM.
1887  * @param tmplate The PTE value to be evaluated.
1888  *
1889  * @return A new PTE value with permission bits modified.
1890  */
1891 static inline
1892 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1893 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1894 {
1895 	assert(!pa_valid(paddr));
1896 
1897 	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1898 
1899 	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1900 		/* PPL to own the page by converting KERN_RW to PPL_RW. */
1901 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1902 		switch (xprr_perm) {
1903 		case XPRR_KERN_RO_PERM:
1904 			break;
1905 		case XPRR_KERN_RW_PERM:
1906 			tmplate &= ~ARM_PTE_XPRR_MASK;
1907 			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1908 			break;
1909 		default:
1910 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1911 		}
1912 	}
1913 
1914 	return tmplate;
1915 }
1916 #endif /* XNU_MONITOR */
1917 
/**
 * Map a physically contiguous range into the kernel at boot time with the
 * memory attributes selected by `options`.
 *
 * @param virt Starting kernel virtual address; leaf PTEs for the range must
 *             already exist (panics otherwise).
 * @param start Starting physical address of the range.
 * @param end Ending physical address (exclusive).
 * @param prot VM_PROT_WRITE selects kernel RW; anything else kernel RO.
 * @param options PMAP_MAP_BD_* value selecting the cache attribute index;
 *                defaults to CACHE_ATTRINDX_DISABLE.
 *
 * @return The virtual address one page past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		/* Non-DRAM pages may need their kernel-write permission stripped for the PPL. */
		if (!pa_valid(paddr)) {
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template by one page. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/*
	 * NOTE(review): the flush length is truncated to 32 bits here, which
	 * assumes boot-time ranges are < 4 GiB — confirm against callers.
	 */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1996 
1997 /*
1998  *      Back-door routine for mapping kernel VM at initialization.
1999  *      Useful for mapping memory outside the range
2000  *      [vm_first_phys, vm_last_phys] (i.e., devices).
2001  *      Otherwise like pmap_map.
2002  */
2003 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2004 pmap_map_bd(
2005 	vm_map_address_t virt,
2006 	vm_offset_t start,
2007 	vm_offset_t end,
2008 	vm_prot_t prot)
2009 {
2010 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
2011 }
2012 
2013 /*
2014  *      Back-door routine for mapping kernel VM at initialization.
2015  *      Useful for mapping memory specific physical addresses in early
2016  *      boot (i.e., before kernel_map is initialized).
2017  *
2018  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
2019  */
2020 
2021 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2022 pmap_map_high_window_bd(
2023 	vm_offset_t pa_start,
2024 	vm_size_t len,
2025 	vm_prot_t prot)
2026 {
2027 	pt_entry_t              *ptep, pte;
2028 	vm_map_address_t        va_start = VREGION1_START;
2029 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
2030 	vm_map_address_t        va_end;
2031 	vm_map_address_t        va;
2032 	vm_size_t               offset;
2033 
2034 	offset = pa_start & PAGE_MASK;
2035 	pa_start -= offset;
2036 	len += offset;
2037 
2038 	if (len > (va_max - va_start)) {
2039 		panic("%s: area too large, "
2040 		    "pa_start=%p, len=%p, prot=0x%x",
2041 		    __FUNCTION__,
2042 		    (void*)pa_start, (void*)len, prot);
2043 	}
2044 
2045 scan:
2046 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2047 		ptep = pmap_pte(kernel_pmap, va_start);
2048 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2049 		if (*ptep == ARM_PTE_TYPE_FAULT) {
2050 			break;
2051 		}
2052 	}
2053 	if (va_start > va_max) {
2054 		panic("%s: insufficient pages, "
2055 		    "pa_start=%p, len=%p, prot=0x%x",
2056 		    __FUNCTION__,
2057 		    (void*)pa_start, (void*)len, prot);
2058 	}
2059 
2060 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2061 		ptep = pmap_pte(kernel_pmap, va_end);
2062 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2063 		if (*ptep != ARM_PTE_TYPE_FAULT) {
2064 			va_start = va_end + PAGE_SIZE;
2065 			goto scan;
2066 		}
2067 	}
2068 
2069 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2070 		ptep = pmap_pte(kernel_pmap, va);
2071 		pte = pa_to_pte(pa_start)
2072 		    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2073 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2074 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2075 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2076 #if __ARM_KERNEL_PROTECT__
2077 		pte |= ARM_PTE_NG;
2078 #endif /* __ARM_KERNEL_PROTECT__ */
2079 		write_pte_strong(ptep, pte);
2080 	}
2081 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2082 #if KASAN
2083 	kasan_notify_address(va_start, len);
2084 #endif
2085 	return va_start;
2086 }
2087 
2088 static uint32_t
pmap_compute_max_asids(void)2089 pmap_compute_max_asids(void)
2090 {
2091 	DTEntry entry;
2092 	void const *prop = NULL;
2093 	uint32_t max_asids;
2094 	int err;
2095 	unsigned int prop_size;
2096 
2097 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2098 	assert(err == kSuccess);
2099 
2100 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2101 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
2102 		 * we can choose a more flexible default value here. */
2103 		return MAX_ASIDS;
2104 	}
2105 
2106 	if (prop_size != sizeof(max_asids)) {
2107 		panic("pmap-max-asids property is not a 32-bit integer");
2108 	}
2109 
2110 	max_asids = *((uint32_t const *)prop);
2111 #if HAS_16BIT_ASID
2112 	if (max_asids > MAX_HW_ASIDS) {
2113 		panic("pmap-max-asids 0x%x too large", max_asids);
2114 	}
2115 #else
2116 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2117 	max_asids = (max_asids + 63) & ~63UL;
2118 
2119 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2120 		/* currently capped by size of pmap->sw_asid */
2121 		panic("pmap-max-asids 0x%x too large", max_asids);
2122 	}
2123 #endif /* HAS_16BIT_ASID */
2124 	if (max_asids == 0) {
2125 		panic("pmap-max-asids cannot be zero");
2126 	}
2127 	return max_asids;
2128 }
2129 
2130 #if __arm64__
2131 /*
2132  * pmap_get_arm64_prot
2133  *
2134  * return effective armv8 VMSA block protections including
2135  * table AP/PXN/XN overrides of a pmap entry
2136  *
2137  */
2138 
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/*
	 * Walk from the root level down to the leaf (or first block) entry,
	 * accumulating the hierarchical table overrides seen along the way.
	 */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		/* An invalid entry at any level means the address is unmapped. */
		if (!(tte & ARM_TTE_VALID)) {
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated table-level AP/XN/PXN overrides. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2187 #endif /* __arm64__ */
2188 
2189 /**
2190  * Helper macros for accessing the "unnested" and "in-progress" bits in
2191  * pmap->nested_region_unnested_table_bitmap.
2192  */
2193 #define UNNEST_BIT(index) ((index) * 2)
2194 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2195 
2196 
2197 /*
2198  *	Bootstrap the system enough to run with virtual memory.
2199  *
2200  *	The early VM initialization code has already allocated
2201  *	the first CPU's translation table and made entries for
2202  *	all the one-to-one mappings to be found there.
2203  *
2204  *	We must set up the kernel pmap structures, the
2205  *	physical-to-virtual translation lookup tables for the
2206  *	physical memory to be managed (between avail_start and
2207  *	avail_end).
2208  *
2209  *	Map the kernel's code and data, and allocate the system page table.
2210  *	Page_size must already be set.
2211  *
2212  *	Parameters:
2213  *	first_avail	first available physical page -
2214  *			   after kernel page tables
2215  *	avail_start	PA of first managed physical page
2216  *	avail_end	PA of last managed physical page
2217  */
2218 
2219 void
pmap_bootstrap(vm_offset_t vstart)2220 pmap_bootstrap(
2221 	vm_offset_t vstart)
2222 {
2223 	vm_map_offset_t maxoffset;
2224 
2225 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2226 
2227 #if XNU_MONITOR
2228 
2229 #if DEVELOPMENT || DEBUG
2230 	PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2231 #endif
2232 
2233 #if CONFIG_CSR_FROM_DT
2234 	if (csr_unsafe_kernel_text) {
2235 		pmap_ppl_disable = true;
2236 	}
2237 #endif /* CONFIG_CSR_FROM_DT */
2238 
2239 #endif /* XNU_MONITOR */
2240 
2241 #if DEVELOPMENT || DEBUG
2242 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2243 		kprintf("Kernel traces for pmap operations enabled\n");
2244 	}
2245 #endif
2246 
2247 	/*
2248 	 *	Initialize the kernel pmap.
2249 	 */
2250 #if ARM_PARAMETERIZED_PMAP
2251 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2252 #endif /* ARM_PARAMETERIZED_PMAP */
2253 #if HAS_APPLE_PAC
2254 	kernel_pmap->disable_jop = 0;
2255 #endif /* HAS_APPLE_PAC */
2256 	kernel_pmap->tte = cpu_tte;
2257 	kernel_pmap->ttep = cpu_ttep;
2258 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2259 	kernel_pmap->max = UINTPTR_MAX;
2260 	os_atomic_init(&kernel_pmap->ref_count, 1);
2261 #if XNU_MONITOR
2262 	os_atomic_init(&kernel_pmap->nested_count, 0);
2263 #endif
2264 	kernel_pmap->nx_enabled = TRUE;
2265 #ifdef  __arm64__
2266 	kernel_pmap->is_64bit = TRUE;
2267 #else
2268 	kernel_pmap->is_64bit = FALSE;
2269 #endif
2270 #if CONFIG_ROSETTA
2271 	kernel_pmap->is_rosetta = FALSE;
2272 #endif
2273 
2274 #if ARM_PARAMETERIZED_PMAP
2275 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2276 #endif /* ARM_PARAMETERIZED_PMAP */
2277 
2278 	kernel_pmap->nested_region_addr = 0x0ULL;
2279 	kernel_pmap->nested_region_size = 0x0ULL;
2280 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2281 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2282 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2283 
2284 	kernel_pmap->hw_asid = 0;
2285 	kernel_pmap->sw_asid = 0;
2286 
2287 	pmap_lock_init(kernel_pmap);
2288 
2289 	pmap_max_asids = pmap_compute_max_asids();
2290 #if HAS_16BIT_ASID
2291 	asid_chunk_size = MAX_HW_ASIDS;
2292 #else
2293 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2294 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2295 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2296 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2297 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2298 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2299 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2300 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2301 #endif /* HAS_16BIT_ASIDS */
2302 
2303 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2304 
2305 #if HAS_SPECRES_DEBUGGING
2306 	PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2307 
2308 	if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2309 		panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2310 	}
2311 #endif /* HAS_SPECRES_DEBUGGING */
2312 
2313 	/**
2314 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2315 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2316 	 * space for these data structures.
2317 	 */
2318 	pmap_data_bootstrap();
2319 
2320 	/**
2321 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2322 	 */
2323 	uat_bootstrap();
2324 
2325 
2326 	/**
2327 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2328 	 */
2329 	sart_bootstrap();
2330 
2331 	/**
2332 	 * Don't make any assumptions about the alignment of avail_start before this
2333 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2334 	 */
2335 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2336 
2337 	const pmap_paddr_t pmap_struct_start = avail_start;
2338 
2339 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2340 	avail_start = round_page(avail_start + asid_table_size);
2341 
2342 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2343 
2344 	vm_first_phys = gPhysBase;
2345 	vm_last_phys = trunc_page(avail_end);
2346 
2347 	queue_init(&map_pmap_list);
2348 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2349 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2350 	free_page_size_tt_count = 0;
2351 	free_page_size_tt_max = 0;
2352 	free_tt_list = TT_FREE_ENTRY_NULL;
2353 	free_tt_count = 0;
2354 	free_tt_max = 0;
2355 
2356 	virtual_space_start = vstart;
2357 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2358 
2359 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2360 #if !HAS_16BIT_ASID
2361 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2362 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2363 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2364 #endif /* !HAS_16BIT_ASID */
2365 
2366 
2367 
2368 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2369 		maxoffset = trunc_page(maxoffset);
2370 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2371 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2372 			arm_pmap_max_offset_default = maxoffset;
2373 		}
2374 	}
2375 #if defined(__arm64__)
2376 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2377 		maxoffset = trunc_page(maxoffset);
2378 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2379 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2380 			arm64_pmap_max_offset_default = maxoffset;
2381 		}
2382 	}
2383 #endif
2384 
2385 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2386 
2387 
2388 #if PMAP_CS_PPL_MONITOR
2389 	/* Initialize the PPL trust cache read-write lock */
2390 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2391 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2392 #endif
2393 
2394 #if DEVELOPMENT || DEBUG
2395 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2396 	    &vm_footprint_suspend_allowed,
2397 	    sizeof(vm_footprint_suspend_allowed));
2398 #endif /* DEVELOPMENT || DEBUG */
2399 
2400 #if KASAN
2401 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2402 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2403 #endif /* KASAN */
2404 
2405 	/**
2406 	 * Ensure that avail_start is always left on a page boundary. The calling
2407 	 * code might not perform any alignment before allocating page tables so
2408 	 * this is important.
2409 	 */
2410 	avail_start = round_page(avail_start);
2411 }
2412 
2413 #if XNU_MONITOR
2414 
2415 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2416 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2417 {
2418 	pmap_paddr_t cur_pa;
2419 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2420 		assert(pa_valid(cur_pa));
2421 		ppattr_pa_set_monitor(cur_pa);
2422 	}
2423 }
2424 
2425 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2426 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2427     pmap_paddr_t end_pa,
2428     unsigned int expected_perm,
2429     unsigned int new_perm)
2430 {
2431 	vm_offset_t start_va = phystokv(start_pa);
2432 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2433 
2434 	pa_set_range_monitor(start_pa, end_pa);
2435 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2436 }
2437 
/**
 * Apply the PVH_FLAG_LOCKDOWN_KC flag to every physical page backing the
 * kernelcache, preventing those pages from being remapped.  Pages whose
 * physical-to-virtual translation is non-linear w.r.t. the kernelcache are
 * skipped (they will be reclaimed by the OS).
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() clamps range_size to the contiguous run it translated. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		/* Double-lockdown indicates inconsistent bookkeeping; fail loudly. */
		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* CTRR test pages must remain modifiable by the test harness; undo their lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2483 
/**
 * Transfer ownership/protection of the pmap's static (boot-time) allocations
 * to the PPL and lock down the kernelcache.  Called once, after all bootstrap
 * allocations have been carved out of avail_start.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	/* PPLDATA_CONST may be absent from some kernelcaches; guard on its size. */
	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2571 
2572 
/*
 * Finalize PPL (Page Protection Layer) lockdown: lock down the commpage
 * mappings so they can no longer be remapped, then write-protect the
 * kernel RO commpage.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	/*
	 * The lockdown helpers below use _nopreempt locking primitives, which
	 * require the caller to have already disabled preemption.
	 */
	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* Text commpage exists on this config; lock it down as read/execute. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error appears to belong to a preprocessor
	 * conditional whose surrounding #if/#else lines were elided by the
	 * cross-reference extraction — confirm against the original file; as
	 * written here it would fail every build.
	 */
#error "XPRR configuration error"
}
2589 #endif /* XNU_MONITOR */
2590 
2591 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2592 pmap_virtual_space(
2593 	vm_offset_t *startp,
2594 	vm_offset_t *endp
2595 	)
2596 {
2597 	*startp = virtual_space_start;
2598 	*endp = virtual_space_end;
2599 }
2600 
2601 
/*
 * Enumerate the kernel virtual regions the VM layer may manage.  The VM
 * calls this with increasing values of region_select until it returns
 * FALSE; each TRUE result fills in one [*startp, *startp + *size) region.
 * The set of regions depends on KTRR/CTRR and ARM_LARGE_MEMORY config.
 *
 * @param region_select index of the region being queried.
 * @param startp        filled with the region's start address on success.
 * @param size          filled with the region's size on success.
 *
 * @return TRUE if region_select names a valid region for this config.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K page geometry: align to a 32MB (0x2000000) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K page geometry: align to an 8MB (0x800000) boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region if region 0 did not already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2695 
2696 /*
2697  * Routines to track and allocate physical pages during early boot.
2698  * On most systems that memory runs from first_avail through to avail_end
2699  * with no gaps.
2700  *
2701  * If the system supports ECC and ecc_bad_pages_count > 0, we
2702  * need to skip those pages.
2703  */
2704 
/* Count of boot-time physical pages still available via pmap_next_page(). */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() runs (lazily, on first query). */
static bool need_ram_ranges_init = true;
2707 
2708 
2709 /**
2710  * Checks to see if a given page is in
2711  * the array of known bad pages
2712  *
2713  * @param ppn page number to check
2714  */
2715 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2716 pmap_is_bad_ram(__unused ppnum_t ppn)
2717 {
2718 	return false;
2719 }
2720 
2721 /**
2722  * Prepare bad ram pages to be skipped.
2723  */
2724 
2725 
2726 /*
2727  * Initialize the count of available pages. No lock needed here,
2728  * as this code is called while kernel boot up is single threaded.
2729  */
2730 static void
initialize_ram_ranges(void)2731 initialize_ram_ranges(void)
2732 {
2733 	pmap_paddr_t first = first_avail;
2734 	pmap_paddr_t end = avail_end;
2735 
2736 	assert(first <= end);
2737 	assert(first == (first & ~PAGE_MASK));
2738 	assert(end == (end & ~PAGE_MASK));
2739 	avail_page_count = atop(end - first);
2740 
2741 	need_ram_ranges_init = false;
2742 
2743 }
2744 
2745 unsigned int
pmap_free_pages(void)2746 pmap_free_pages(
2747 	void)
2748 {
2749 	if (need_ram_ranges_init) {
2750 		initialize_ram_ranges();
2751 	}
2752 	return avail_page_count;
2753 }
2754 
2755 unsigned int
pmap_free_pages_span(void)2756 pmap_free_pages_span(
2757 	void)
2758 {
2759 	if (need_ram_ranges_init) {
2760 		initialize_ram_ranges();
2761 	}
2762 	return (unsigned int)atop(avail_end - first_avail);
2763 }
2764 
2765 
2766 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2767 pmap_next_page_hi(
2768 	ppnum_t            * pnum,
2769 	__unused boolean_t might_free)
2770 {
2771 	return pmap_next_page(pnum);
2772 }
2773 
2774 
2775 boolean_t
pmap_next_page(ppnum_t * pnum)2776 pmap_next_page(
2777 	ppnum_t *pnum)
2778 {
2779 	if (need_ram_ranges_init) {
2780 		initialize_ram_ranges();
2781 	}
2782 
2783 
2784 	if (first_avail != avail_end) {
2785 		*pnum = (ppnum_t)atop(first_avail);
2786 		first_avail += PAGE_SIZE;
2787 		assert(avail_page_count > 0);
2788 		--avail_page_count;
2789 		return TRUE;
2790 	}
2791 	assert(avail_page_count == 0);
2792 	return FALSE;
2793 }
2794 
2795 
2796 /*
2797  *	Initialize the pmap module.
2798  *	Called by vm_init, to initialize any structures that the pmap
2799  *	system needs to map virtual memory.
2800  */
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	/* ZC_ZFREE_CLEARMEM: scrub pmap structs on free so stale state can't leak. */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	/* Each process needs its own ASID, so maxproc can never exceed the ASID pool. */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2841 
2842 /**
2843  * Verify that a given physical page contains no mappings (outside of the
2844  * default physical aperture mapping).
2845  *
2846  * @param ppnum Physical page number to check there are no mappings to.
2847  *
2848  * @return True if there are no mappings, false otherwise or if the page is not
2849  *         kernel-managed.
2850  */
2851 bool
pmap_verify_free(ppnum_t ppnum)2852 pmap_verify_free(ppnum_t ppnum)
2853 {
2854 	const pmap_paddr_t pa = ptoa(ppnum);
2855 
2856 	assert(pa != vm_page_fictitious_addr);
2857 
2858 	/* Only mappings to kernel-managed physical memory are tracked. */
2859 	if (!pa_valid(pa)) {
2860 		return false;
2861 	}
2862 
2863 	const unsigned int pai = pa_index(pa);
2864 	pv_entry_t **pvh = pai_to_pvh(pai);
2865 
2866 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2867 }
2868 
2869 #if MACH_ASSERT
2870 /**
2871  * Verify that a given physical page contains no mappings (outside of the
2872  * default physical aperture mapping) and if it does, then panic.
2873  *
2874  * @note It's recommended to use pmap_verify_free() directly when operating in
2875  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2876  *       normally being called from outside of the PPL, and the pv_head_table
2877  *       can't be modified outside of the PPL).
2878  *
2879  * @param ppnum Physical page number to check there are no mappings to.
2880  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		/* Non-managed page, or no mappings recorded: nothing to assert. */
		return;
	}

	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH entry points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		/* Multi-mapping case: walk the PVE list for the first valid PTE. */
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2953 #endif
2954 
2955 
2956 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2957 pmap_root_alloc_size(pmap_t pmap)
2958 {
2959 #pragma unused(pmap)
2960 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2961 	unsigned int root_level = pt_attr_root_level(pt_attr);
2962 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2963 }
2964 
2965 
2966 /*
2967  *	Create and return a physical map.
2968  *
2969  *	If the size specified for the map
2970  *	is zero, the map is an actual physical
2971  *	map, and may be referenced by the
2972  *	hardware.
2973  *
2974  *	If the size specified is non-zero,
2975  *	the map will be used in software only, and
2976  *	is bounded by that size.
2977  */
/*
 * Create and initialize a user pmap.
 *
 * @param ledger ledger to charge this pmap's accounting against; a
 *               reference is taken by the caller (see pmap_create_options).
 * @param size   must be 0 here; a non-zero size is rejected (only
 *               meaningful for stage-2 translation, not supported here).
 * @param flags  PMAP_CREATE_* options; unknown bits are rejected.
 * @param kr     out: failure code when PMAP_NULL is returned via the
 *               pmap_create_fail path.
 *
 * @return the new pmap with one reference held, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	/*
	 * NOTE(review): the two early returns below bypass the
	 * pmap_create_fail path and therefore never store to *kr — callers
	 * must pre-initialize *kr (pmap_create_options does).  Confirm this
	 * is intentional.
	 */
	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		/* Reject flag bits this kernel does not understand. */
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		/* PPL builds validate and retain the ledger inside the monitor. */
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds yet: cover the whole address space. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* Depends on pmap_pt_attr, so must come after it is selected above. */
	p->max = pmap_user_va_size(p);

	/* Claim an ASID (or equivalent hardware address-space ID). */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	/* In the PPL we cannot block, so the allocation must not wait. */
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Make the zeroed root table visible to the table walker. */
	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition (goto-cleanup pattern). */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it while the PPL writes through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3168 
/*
 * Kernel-side entry point for pmap creation.  Takes a ledger reference,
 * then delegates to pmap_create_options_internal (directly, or via the
 * PPL trampoline on XNU_MONITOR builds, retrying after donating a page
 * to the PPL whenever it reports KERN_RESOURCE_SHORTAGE).
 *
 * @param ledger ledger to charge the new pmap against.
 * @param size   must be 0 (see pmap_create_options_internal).
 * @param flags  PMAP_CREATE_* options.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	/* Hold a ledger reference for the pmap; dropped below on failure. */
	ledger_reference(ledger);

#if XNU_MONITOR
	/*
	 * The PPL cannot allocate its own pages; when it runs out, feed it a
	 * page from the kernel and retry until creation succeeds or fails
	 * for a different reason.
	 */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3204 
3205 #if XNU_MONITOR
3206 /*
3207  * This symbol remains in place when the PPL is enabled so that the dispatch
3208  * table does not change from development to release configurations.
3209  */
3210 #endif
3211 #if MACH_ASSERT || XNU_MONITOR
3212 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3213 pmap_set_process_internal(
3214 	__unused pmap_t pmap,
3215 	__unused int pid,
3216 	__unused char *procname)
3217 {
3218 #if MACH_ASSERT
3219 	if (pmap == NULL || pmap->pmap_pid == -1) {
3220 		return;
3221 	}
3222 
3223 	validate_pmap_mutable(pmap);
3224 
3225 	pmap->pmap_pid = pid;
3226 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3227 #endif /* MACH_ASSERT */
3228 }
3229 #endif /* MACH_ASSERT || XNU_MONITOR */
3230 
3231 #if MACH_ASSERT
/*
 * Kernel-side wrapper for pmap_set_process_internal: dispatches through
 * the PPL trampoline on XNU_MONITOR builds, calls directly otherwise.
 * Only built when MACH_ASSERT is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3244 #endif /* MACH_ASSERT */
3245 
3246 /*
3247  * pmap_deallocate_all_leaf_tts:
3248  *
3249  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3250  * removing and deallocating all TTEs.
3251  */
/*
 * pmap_deallocate_all_leaf_tts:
 *
 * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
 * removing and deallocating all TTEs.
 *
 * @param pmap       the pmap whose tables are being torn down.
 * @param first_ttep first entry of the translation table at this level.
 * @param level      current table level; must be above the leaf level.
 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the highest entry at this level (~0 maxes out every index field). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in a user pmap being destroyed. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/*
		 * NOTE(review): no matching pmap_unlock() appears here —
		 * pmap_tte_deallocate() presumably drops the exclusive lock on the
		 * caller's behalf; confirm against its definition.
		 */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3290 
3291 /*
3292  * We maintain stats and ledgers so that a task's physical footprint is:
3293  * phys_footprint = ((internal - alternate_accounting)
3294  *                   + (internal_compressed - alternate_accounting_compressed)
3295  *                   + iokit_mapped
3296  *                   + purgeable_nonvolatile
3297  *                   + purgeable_nonvolatile_compressed
3298  *                   + page_table)
3299  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3300  */
3301 
3302 /*
3303  *	Retire the given physical map from service.
3304  *	Should only be called if the map contains
3305  *	no valid mappings.
3306  */
/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 *
 * Drops one reference; actual teardown (TLB flush, ASID release, table
 * and bitmap freeing, ledger checks) happens only when the count hits 0.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Release-ordered decrement: prior stores to the pmap happen-before reuse. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	/* Sanity: the pmap must not be nested into, in-flight, or active anywhere. */
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unpublish from the global pmap list before tearing anything down. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Shared-region pmap: flush only its nested region, by VA range. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		/* Regular pmap: flush by ASID, then return the ASID to the pool. */
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		/* Drop the PPL-side ledger reference taken at creation. */
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3420 
/*
 * Kernel-side entry point for pmap destruction: drops one reference via
 * pmap_destroy_internal (through the PPL trampoline on XNU_MONITOR
 * builds), then drops the kernel-side ledger reference.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger now: the pmap may be freed by the call below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Matches the ledger_reference() taken in pmap_create_options(). */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3441 
3442 
3443 /*
3444  *	Add a reference to the specified pmap.
3445  */
3446 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3447 pmap_reference_internal(
3448 	pmap_t pmap)
3449 {
3450 	if (pmap != PMAP_NULL) {
3451 		validate_pmap_mutable(pmap);
3452 		os_atomic_inc(&pmap->ref_count, acquire);
3453 	}
3454 }
3455 
/*
 * Kernel-side wrapper for pmap_reference_internal: dispatches through
 * the PPL trampoline on XNU_MONITOR builds, calls directly otherwise.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3466 
/*
 * Allocate a root (TT1) translation table for a pmap.
 *
 * Satisfies the request from the per-size free lists when possible;
 * otherwise allocates a fresh physical page, carves any remainder into
 * sub-page entries for the free list, and returns the base.
 *
 * @param pmap   pmap the table is charged to (ledger credit).
 * @param size   requested table size; sub-page sizes other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to PAGE_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to forbid blocking allocation.
 *
 * @return kernel VA of the table, or NULL on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only PMAP_ROOT_ALLOC_SIZE may be sub-page; all else rounds to a page. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: take a cached table off the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists empty: allocate a fresh zeroed physical page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * Sub-page request: keep the first `size` bytes and chain the rest
		 * of the page onto the sub-page free list (built locally first so
		 * tt1_lock is held only for the list splice).
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3547 
/**
 * Release a top-level (TT1/root) translation table back to the free lists.
 *
 * Tables are cached on per-size free lists rather than immediately returned
 * to the VM. Page-sized tables beyond FREE_PAGE_SIZE_TT_MAX are handed back
 * to the page allocator, unless the caller asked not to block.
 *
 * @param pmap The pmap the table belonged to (used for ledger accounting).
 * @param tt The table being freed.
 * @param size Table size in bytes; rounded up to PAGE_SIZE unless it is
 *             exactly PMAP_ROOT_ALLOC_SIZE, mirroring pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip trimming the page-sized
 *               free list (trimming calls the page allocator and may block).
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Match the size normalization done in pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Sub-page tables go onto the small-table free list. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	/* Page-sized tables go onto their own free list. */
	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list down to its cap. The lock is dropped
	 * around each pmap_pages_free() call, which may block; the count was
	 * already decremented under the lock so other threads see a consistent
	 * view.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3606 
/**
 * Allocate a translation table for the given level of a pmap's hierarchy.
 *
 * A cached entry from pmap->tt_entry_free is reused when available; otherwise
 * a zeroed physical page and a fresh page table descriptor (PTD) are
 * allocated. When the pmap's page size is smaller than the kernel PAGE_SIZE,
 * the unused chunks of the new page are chained onto pmap->tt_entry_free.
 *
 * @param pmap The pmap to allocate the table for.
 * @param ttp Output: KVA of the new table, or NULL on failure.
 * @param level Page table level the table will serve; only used to decide
 *              which usage counters to bump (twig vs. leaf).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid
 *                blocking on resource shortage.
 *
 * @return KERN_SUCCESS on success, KERN_RESOURCE_SHORTAGE when a
 *         non-blocking allocation failed, or KERN_ABORTED when a preemptible
 *         lock acquisition was interrupted.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		/* Pop the head of the free list. */
		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 *  Allocate a VM page for the level x page table entries.
		 *
		 * NOTE(review): the non-blocking page allocation is keyed off
		 * PMAP_TT_ALLOCATE_NOWAIT while the early return below checks
		 * PMAP_OPTIONS_NOWAIT — confirm callers wanting fully
		 * non-blocking behavior pass both flags.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Twig-and-above tables and leaf tables are counted separately. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			/* Chain the remaining sub-page chunks onto the pmap's free list. */
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3707 
3708 
/**
 * Release a (non-root) translation table back to its pmap's free list, and
 * reclaim the containing physical page once every table chunk on that page
 * is unused.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being freed; its refcount must be zero (leaf)
 *            or the PT_DESC_REFCOUNT sentinel (non-leaf).
 * @param level The level the table served (twig vs. leaf decides which usage
 *              counters are decremented when the page is reclaimed).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-sized table chunks sharing one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this chunk's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry a sentinel refcount; clear it before checking. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every table chunk sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/* Count how many of this page's sibling chunks are already on the
		 * pmap's free list (this chunk itself counts as the first). */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/* Every chunk of the page is now free: unlink them all from
			 * the free list and reclaim the whole page below. */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other chunks of the page are still in use; just cache this one. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Drop the PTD, clear the PV head, and return the page to the allocator. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3797 
3798 /**
3799  * Safely clear out a translation table entry.
3800  *
3801  * @note If the TTE to clear out points to a leaf table, then that leaf table
3802  *       must have a refcnt of zero before the TTE can be removed.
3803  * @note This function expects to be called with pmap locked exclusive, and will
3804  *       return with pmap unlocked.
3805  *
3806  * @param pmap The pmap containing the page table whose TTE is being removed.
3807  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3808  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3809  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3810  * @param ttep Pointer to the TTE that should be cleared out.
3811  * @param level The level of the page table that contains the TTE to be removed.
3812  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and make the update visible before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* Start by assuming every possible CPU may have a disconnect in flight. */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/* WFE-wait on any CPU still inside pmap_disconnect(). */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3987 
3988 /**
3989  * Given a pointer to an entry within a `level` page table, delete the
3990  * page table at `level` + 1 that is represented by that entry. For instance,
3991  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3992  * contains the PA of the L3 table, and `level` would be "2".
3993  *
3994  * @note If the table getting deallocated is a leaf table, then that leaf table
3995  *       must have a refcnt of zero before getting deallocated. All other levels
3996  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3997  * @note This function expects to be called with pmap locked exclusive and will
3998  *       return with pmap unlocked.
3999  *
4000  * @param pmap The pmap that owns the page table to be deallocated.
4001  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
4002  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
4003  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
4004  * @param ttep Pointer to the `level` TTE to remove.
4005  * @param level The level of the table that contains an entry pointing to the
4006  *              table to be removed. The deallocated page table will be a
4007  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
4008  *              deleted).
4009  */
4010 void
4011 pmap_tte_deallocate(
4012 	pmap_t pmap,
4013 	vm_offset_t va_start,
4014 	vm_offset_t va_end,
4015 	bool need_strong_sync,
4016 	tt_entry_t *ttep,
4017 	unsigned int level)
4018 {
4019 	tt_entry_t tte;
4020 
4021 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4022 
4023 	tte = *ttep;
4024 
4025 	if (tte_get_ptd(tte)->pmap != pmap) {
4026 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4027 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4028 	}
4029 
4030 	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
4031 	    __func__, ttep, (unsigned long long)tte);
4032 
4033 	/* pmap_tte_remove() will drop the pmap lock */
4034 	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4035 
4036 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4037 }
4038 
4039 /*
4040  *	Remove a range of hardware page-table entries.
4041  *	The entries given are the first (inclusive)
4042  *	and last (exclusive) entries for the VM pages.
4043  *	The virtual address is the va for the first pte.
4044  *
4045  *	The pmap must be locked.
4046  *	If the pmap is not the kernel pmap, the range must lie
4047  *	entirely within one pte-page.  This is NOT checked.
4048  *	Assumes that the pte-page exists.
4049  *
4050  *	Returns the number of PTE changed
4051  */
4052 MARK_AS_PMAP_TEXT static int
4053 pmap_remove_range(
4054 	pmap_t pmap,
4055 	vm_map_address_t va,
4056 	pt_entry_t *bpte,
4057 	pt_entry_t *epte)
4058 {
4059 	bool need_strong_sync = false;
4060 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4061 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4062 	if (num_changed > 0) {
4063 		PMAP_UPDATE_TLBS(pmap, va,
4064 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4065 	}
4066 	return num_changed;
4067 }
4068 
4069 
4070 #ifdef PVH_FLAG_EXEC
4071 
4072 /*
4073  *	Update the access protection bits of the physical aperture mapping for a page.
4074  *	This is useful, for example, in guranteeing that a verified executable page
4075  *	has no writable mappings anywhere in the system, including the physical
4076  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4077  *	synchronization overhead in cases where the call to this function is
4078  *	guaranteed to be followed by other TLB operations.
4079  */
4080 void
4081 pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
4082 {
4083 #if __ARM_PTE_PHYSMAP__
4084 	pvh_assert_locked(pai);
4085 	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
4086 	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);
4087 
4088 	pt_entry_t tmplate = *pte_p;
4089 	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
4090 		return;
4091 	}
4092 	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
4093 	if (tmplate & ARM_PTE_HINT_MASK) {
4094 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
4095 		    __func__, pte_p, (void *)kva, tmplate);
4096 	}
4097 	write_pte_strong(pte_p, tmplate);
4098 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
4099 	if (!flush_tlb_async) {
4100 		sync_tlb_flush();
4101 	}
4102 #endif
4103 }
4104 #endif /* defined(PVH_FLAG_EXEC) */
4105 
4106 
4107 
/**
 * Remove a range of hardware PTEs within a single leaf page table, without
 * issuing TLB maintenance (that is the caller's responsibility).
 *
 * @param pmap The pmap the mappings belong to.
 * @param va VA mapped by the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove.
 * @param epte Last (exclusive) PTE; the range must not cross a page table
 *             boundary relative to bpte.
 * @param eva If non-NULL, enables early exit on pending preemption; on early
 *            exit it is set to the VA at which processing stopped.
 * @param need_strong_sync Output: set to true when a removed mapping requires
 *                         strong DSB TLB synchronization (HAS_FEAT_XS only).
 * @param options PMAP_OPTIONS_REMOVE to also clear "compressed" markers.
 *
 * @return The number of PTEs actually modified.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t     *cpte;
	size_t          npages = 0;
	int             num_removed, num_unwired;
	int             num_pte_changed;
	unsigned int    pai = 0;
	pmap_paddr_t    pa;
	int             num_external, num_internal, num_reusable;
	int             num_alt_internal;
	uint64_t        num_compressed, num_alt_compressed;
	int16_t         refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t      spte;
		boolean_t       managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Determine whether the PTE maps a managed page, taking the PVH
		 * lock when it does; retries if the PTE changes underneath us.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			/* Lock the PV head, then re-read the PTE to verify it still
			 * refers to the same physical page. */
			pai = pa_index(pa);
			pvh_lock(pai);
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger updates below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 *	Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta (removals + cleared
		 * compressed markers) to the page table's descriptor. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4356 
4357 
4358 /*
4359  *	Remove the given range of addresses
4360  *	from the specified map.
4361  *
4362  *	It is assumed that the start and end are properly
4363  *	rounded to the hardware page size.
4364  */
4365 void
4366 pmap_remove(
4367 	pmap_t pmap,
4368 	vm_map_address_t start,
4369 	vm_map_address_t end)
4370 {
4371 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4372 }
4373 
/*
 *	Remove mappings in the VA range [start, end) from 'pmap'.  The range
 *	must be leaf-page aligned, and the caller (pmap_remove_options()) is
 *	responsible for batching larger requests so each call stays within a
 *	single twig-level (L2) table entry.
 *
 *	@param pmap    the pmap from which to remove mappings
 *	@param start   low end of the VA range to remove (leaf-page aligned)
 *	@param end     high end of the VA range to remove (leaf-page aligned)
 *	@param options PMAP_OPTIONS_* flags forwarded to the removal path
 *
 *	@return the VA at which removal actually stopped ("eva").  This may
 *	be less than 'end' if pmap_remove_range_options() cut the operation
 *	short; the caller loops until the full range has been processed.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject inverted or non-leaf-page-aligned ranges outright. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig-level entry covering 'start' means nothing is mapped here. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	/* Only a table-type TTE has leaf PTEs beneath it to remove. */
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/*
		 * If this was the last mapping in a user pagetable, free the
		 * now-empty leaf table as well.
		 */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* Flush TLB entries for the removed mappings unless already done above. */
	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4440 
/*
 *	Remove the given range of addresses from the specified pmap.
 *
 *	The work is carved into batches (at most one twig-level/L2 region at
 *	a time, smaller on some platforms) so that the non-preemptible
 *	removal primitive runs for a bounded time per call, with preemption
 *	possible between batches.
 *
 *	@param pmap    the pmap to remove mappings from; PMAP_NULL is a no-op
 *	@param start   low end of the VA range (leaf-page aligned)
 *	@param end     high end of the VA range (leaf-page aligned)
 *	@param options PMAP_OPTIONS_* flags forwarded to the removal path
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/*
		 * rdar://84856940
		 * Use smaller fixed-size batches on this platform, still
		 * clamped so a batch never crosses a twig (L2) boundary.
		 */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance to the next twig (L2) boundary. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL configurations trap into the monitor for the actual removal. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4507 
4508 
4509 /*
4510  *	Remove phys addr if mapped in specified map
4511  */
4512 void
4513 pmap_remove_some_phys(
4514 	__unused pmap_t map,
4515 	__unused ppnum_t pn)
4516 {
4517 	/* Implement to support working set code */
4518 }
4519 
4520 /*
4521  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4522  * switch a thread onto a new vm_map.
4523  */
4524 void
4525 pmap_switch_user(thread_t thread, vm_map_t new_map)
4526 {
4527 	pmap_t new_pmap = new_map->pmap;
4528 
4529 
4530 	thread->map = new_map;
4531 	pmap_set_pmap(new_pmap, thread);
4532 
4533 }
4534 
/*
 *	Activate 'pmap' as the user address space on the current CPU.
 *	On __ARM_USER_PROTECT__ configurations, also record the new user
 *	translation table base and hardware ASID in the thread's machine
 *	state; otherwise the thread argument is unused.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if     !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t        thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Record the new user TTB and ASID in the thread's machine state. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4549 
4550 static void
4551 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4552 {
4553 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4554 }
4555 
#if HAS_SPECRES
/*
 *	Asynchronously issue a CFP RCTX speculation-restriction operation
 *	scoped to EL0 and this pmap's hardware ASID.  No barrier is issued
 *	here; the caller is responsible for synchronization.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
}

#if REQUIRES_DVP_RCTX
/*
 *	Asynchronously issue a DVP RCTX speculation-restriction operation
 *	scoped to EL0 and this pmap's hardware ASID, on parts that require
 *	it in addition to CFP RCTX.  No barrier is issued here.
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4573 
4574 static inline bool
4575 pmap_user_ttb_is_clear(void)
4576 {
4577 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4578 }
4579 
/*
 *	Per-CPU guts of pmap_switch(): make 'pmap' the active user address
 *	space on the calling CPU.
 *
 *	Determines which forms of TLB maintenance this particular switch
 *	requires (ASID flush, shared-region flush, commpage flush, and on
 *	SPECRES hardware a speculation restriction), whether the user TTB
 *	must first be cleared ("break before make"), performs that
 *	maintenance, and finally installs the new user TTB via
 *	pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* Only the kernel pmap may legitimately carry hardware ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/*
	 * Switching to a pmap whose nested (shared-region) pmap differs from
	 * the one last active on this CPU requires flushing the previous
	 * region's TLB entries and a break-before-make TTB transition.
	 */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides: force speculation restriction on or off. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (e.g. different page size) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform a CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range TLBI support: the full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* Flush the global commpage nesting region when switching page sizes. */
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize any of the async invalidations issued above. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4757 
/*
 *	Switch the current CPU's active user address space to the given
 *	pmap, dispatching to the PPL when XNU_MONITOR is configured and
 *	calling pmap_switch_internal() directly otherwise.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4770 
4771 void
4772 pmap_page_protect(
4773 	ppnum_t ppnum,
4774 	vm_prot_t prot)
4775 {
4776 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4777 }
4778 
4779 /*
4780  *	Routine:	pmap_page_protect_options
4781  *
4782  *	Function:
4783  *		Lower the permission for all mappings to a given
4784  *		page.
4785  */
4786 MARK_AS_PMAP_TEXT static void
4787 pmap_page_protect_options_with_flush_range(
4788 	ppnum_t ppnum,
4789 	vm_prot_t prot,
4790 	unsigned int options,
4791 	pmap_tlb_flush_range_t *flush_range)
4792 {
4793 	pmap_paddr_t    phys = ptoa(ppnum);
4794 	pv_entry_t    **pv_h;
4795 	pv_entry_t     *pve_p, *orig_pve_p;
4796 	pv_entry_t     *pveh_p;
4797 	pv_entry_t     *pvet_p;
4798 	pt_entry_t     *pte_p, *orig_pte_p;
4799 	pv_entry_t     *new_pve_p;
4800 	pt_entry_t     *new_pte_p;
4801 	vm_offset_t     pvh_flags;
4802 	unsigned int    pai;
4803 	bool            remove;
4804 	bool            set_NX;
4805 	unsigned int    pvh_cnt = 0;
4806 	unsigned int    pass1_updated = 0;
4807 	unsigned int    pass2_updated = 0;
4808 
4809 	assert(ppnum != vm_page_fictitious_addr);
4810 
4811 	/* Only work with managed pages. */
4812 	if (!pa_valid(phys)) {
4813 		return;
4814 	}
4815 
4816 	/*
4817 	 * Determine the new protection.
4818 	 */
4819 	switch (prot) {
4820 	case VM_PROT_ALL:
4821 		return;         /* nothing to do */
4822 	case VM_PROT_READ:
4823 	case VM_PROT_READ | VM_PROT_EXECUTE:
4824 		remove = false;
4825 		break;
4826 	default:
4827 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4828 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4829 		remove = true;
4830 		break;
4831 	}
4832 
4833 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4834 	if (remove) {
4835 #if !XNU_MONITOR
4836 		mp_disable_preemption();
4837 #endif
4838 		pmap_cpu_data = pmap_get_cpu_data();
4839 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4840 		/*
4841 		 * Ensure the store to inflight_disconnect will be observed before any of the
4842 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4843 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4844 		 * another CPU, in between this function's clearing a PTE and dropping the
4845 		 * corresponding pagetable refcount.  That can lead to a panic if the
4846 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4847 		 * store barrier; a store-release operation would not be sufficient.
4848 		 */
4849 		os_atomic_thread_fence(release);
4850 	}
4851 
4852 	pai = pa_index(phys);
4853 	pvh_lock(pai);
4854 	pv_h = pai_to_pvh(pai);
4855 	pvh_flags = pvh_get_flags(pv_h);
4856 
4857 #if XNU_MONITOR
4858 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4859 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4860 	}
4861 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4862 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4863 	}
4864 	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4865 		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4866 	}
4867 #endif
4868 
4869 
4870 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4871 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4872 	pveh_p = PV_ENTRY_NULL;
4873 	pvet_p = PV_ENTRY_NULL;
4874 	new_pve_p = PV_ENTRY_NULL;
4875 	new_pte_p = PT_ENTRY_NULL;
4876 
4877 
4878 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4879 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4880 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4881 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4882 		pveh_p = pve_p;
4883 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4884 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4885 	}
4886 
4887 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4888 	int pve_ptep_idx = 0;
4889 
4890 	/*
4891 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4892 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4893 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4894 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4895 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4896 	 * tlb_flush_needed to be true while issue_tlbi is false.
4897 	 */
4898 	bool issue_tlbi = false;
4899 	bool tlb_flush_needed = false;
4900 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4901 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4902 		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4903 		bool update = false;
4904 
4905 		if (pve_p != PV_ENTRY_NULL) {
4906 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4907 			if (pte_p == PT_ENTRY_NULL) {
4908 				goto protect_skip_pve_pass1;
4909 			}
4910 		}
4911 
4912 #ifdef PVH_FLAG_IOMMU
4913 		if (pvh_ptep_is_iommu(pte_p)) {
4914 #if XNU_MONITOR
4915 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4916 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4917 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4918 			}
4919 #endif
4920 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4921 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4922 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4923 			}
4924 			goto protect_skip_pve_pass1;
4925 		}
4926 #endif
4927 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4928 		const pmap_t pmap = ptdp->pmap;
4929 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4930 
4931 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4932 #if MACH_ASSERT
4933 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4934 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4935 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4936 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4937 
4938 				pv_entry_t *check_pvep = pve_p;
4939 
4940 				do {
4941 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4942 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4943 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4944 					}
4945 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4946 
4947 				/* Restore previous PTEP value. */
4948 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4949 			}
4950 #endif
4951 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4952 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4953 		}
4954 
4955 #if DEVELOPMENT || DEBUG
4956 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4957 #else
4958 		if ((prot & VM_PROT_EXECUTE))
4959 #endif
4960 		{
4961 			set_NX = false;
4962 		} else {
4963 			set_NX = true;
4964 		}
4965 
4966 #if HAS_FEAT_XS
4967 		/**
4968 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4969 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4970 		 */
4971 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4972 #endif /* HAS_FEAT_XS */
4973 
4974 		/* Remove the mapping if new protection is NONE */
4975 		if (remove) {
4976 			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4977 				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
4978 				    __func__, pmap, ppnum);
4979 			}
4980 
4981 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4982 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4983 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4984 			pt_entry_t spte = *pte_p;
4985 
4986 			if (pte_is_wired(spte)) {
4987 				pte_set_wired(pmap, pte_p, 0);
4988 				spte = *pte_p;
4989 				if (pmap != kernel_pmap) {
4990 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4991 				}
4992 			}
4993 
4994 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4995 			    (uint64_t)spte, pte_p, ppnum);
4996 
4997 			if (compress && is_internal && (pmap != kernel_pmap)) {
4998 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4999 				/* mark this PTE as having been "compressed" */
5000 				tmplate = ARM_PTE_COMPRESSED;
5001 				if (is_altacct) {
5002 					tmplate |= ARM_PTE_COMPRESSED_ALT;
5003 				}
5004 			} else {
5005 				tmplate = ARM_PTE_TYPE_FAULT;
5006 			}
5007 
5008 			assert(spte != tmplate);
5009 			write_pte_fast(pte_p, tmplate);
5010 			update = true;
5011 			++pass1_updated;
5012 
5013 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5014 
5015 			if (pmap != kernel_pmap) {
5016 				if (ppattr_test_reusable(pai) &&
5017 				    is_internal &&
5018 				    !is_altacct) {
5019 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5020 				} else if (!is_internal) {
5021 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5022 				}
5023 
5024 				if (is_altacct) {
5025 					assert(is_internal);
5026 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5027 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5028 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5029 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5030 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5031 					}
5032 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5033 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5034 				} else if (ppattr_test_reusable(pai)) {
5035 					assert(is_internal);
5036 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5037 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5038 						/* was not in footprint, but is now */
5039 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5040 					}
5041 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5042 				} else if (is_internal) {
5043 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5044 
5045 					/*
5046 					 * Update all stats related to physical footprint, which only
5047 					 * deals with internal pages.
5048 					 */
5049 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5050 						/*
5051 						 * This removal is only being done so we can send this page to
5052 						 * the compressor; therefore it mustn't affect total task footprint.
5053 						 */
5054 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5055 					} else {
5056 						/*
5057 						 * This internal page isn't going to the compressor, so adjust stats to keep
5058 						 * phys_footprint up to date.
5059 						 */
5060 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5061 					}
5062 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5063 				} else {
5064 					/* external page: no impact on ledgers */
5065 				}
5066 			}
5067 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5068 		} else {
5069 			pt_entry_t spte = *pte_p;
5070 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5071 
5072 			if (pmap == kernel_pmap) {
5073 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5074 			} else {
5075 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5076 			}
5077 
5078 			/*
5079 			 * While the naive implementation of this would serve to add execute
5080 			 * permission, this is not how the VM uses this interface, or how
5081 			 * x86_64 implements it.  So ignore requests to add execute permissions.
5082 			 */
5083 			if (set_NX) {
5084 				tmplate |= pt_attr_leaf_xn(pt_attr);
5085 			}
5086 
5087 
5088 			assert(spte != ARM_PTE_TYPE_FAULT);
5089 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5090 
5091 			if (spte != tmplate) {
5092 				/*
5093 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5094 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5095 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
5096 				 * should always be cleared by this function.
5097 				 */
5098 				pte_set_was_writeable(tmplate, true);
5099 				write_pte_fast(pte_p, tmplate);
5100 				update = true;
5101 				++pass1_updated;
5102 			} else if (pte_was_writeable(tmplate)) {
5103 				/*
5104 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5105 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
5106 				 * write access to a page, this function should always at least clear that flag for
5107 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5108 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
5109 				 * be handled through arm_fast_fault().
5110 				 */
5111 				pte_set_was_writeable(tmplate, false);
5112 				write_pte_fast(pte_p, tmplate);
5113 			}
5114 		}
5115 
5116 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5117 			tlb_flush_needed = true;
5118 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5119 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5120 				issue_tlbi = true;
5121 			}
5122 		}
5123 protect_skip_pve_pass1:
5124 		pte_p = PT_ENTRY_NULL;
5125 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5126 			pve_ptep_idx = 0;
5127 			pve_p = pve_next(pve_p);
5128 		}
5129 	}
5130 
5131 	if (tlb_flush_needed) {
5132 		FLUSH_PTE_STRONG();
5133 	}
5134 
5135 	if (!remove && !issue_tlbi) {
5136 		goto protect_finish;
5137 	}
5138 
5139 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5140 	pv_entry_t **pve_pp = pv_h;
5141 	pve_p = orig_pve_p;
5142 	pte_p = orig_pte_p;
5143 	pve_ptep_idx = 0;
5144 
5145 	/*
5146 	 * We need to keep track of whether a particular PVE list contains IOMMU
5147 	 * mappings when removing entries, because we should only remove CPU
5148 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5149 	 * it around.
5150 	 */
5151 	bool iommu_mapping_in_pve = false;
5152 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5153 		if (pve_p != PV_ENTRY_NULL) {
5154 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5155 			if (pte_p == PT_ENTRY_NULL) {
5156 				goto protect_skip_pve_pass2;
5157 			}
5158 		}
5159 
5160 #ifdef PVH_FLAG_IOMMU
5161 		if (pvh_ptep_is_iommu(pte_p)) {
5162 			iommu_mapping_in_pve = true;
5163 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5164 				/*
5165 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5166 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5167 				 * contain the single IOMMU PTE and exit the loop.
5168 				 */
5169 				new_pte_p = pte_p;
5170 				break;
5171 			}
5172 			goto protect_skip_pve_pass2;
5173 		}
5174 #endif
5175 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5176 		const pmap_t pmap = ptdp->pmap;
5177 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5178 
5179 		if (remove) {
5180 			if (!compress && (pmap != kernel_pmap)) {
5181 				/*
5182 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5183 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5184 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5185 				 * under us.
5186 				 */
5187 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5188 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5189 				}
5190 			}
5191 			/* Remove this CPU mapping from PVE list. */
5192 			if (pve_p != PV_ENTRY_NULL) {
5193 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5194 			}
5195 		} else {
5196 			pt_entry_t spte = *pte_p;
5197 			if (pte_was_writeable(spte)) {
5198 				pte_set_was_writeable(spte, false);
5199 				write_pte_fast(pte_p, spte);
5200 			} else {
5201 				goto protect_skip_pve_pass2;
5202 			}
5203 		}
5204 		++pass2_updated;
5205 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5206 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5207 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5208 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5209 		}
5210 
5211 protect_skip_pve_pass2:
5212 		pte_p = PT_ENTRY_NULL;
5213 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5214 			pve_ptep_idx = 0;
5215 
5216 			if (remove) {
5217 				/**
5218 				 * If there are any IOMMU mappings in the PVE list, preserve
5219 				 * those mappings in a new PVE list (new_pve_p) which will later
5220 				 * become the new PVH entry. Keep track of the CPU mappings in
5221 				 * pveh_p/pvet_p so they can be deallocated later.
5222 				 */
5223 				if (iommu_mapping_in_pve) {
5224 					iommu_mapping_in_pve = false;
5225 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5226 					pve_remove(pv_h, pve_pp, pve_p);
5227 					pveh_p = pvh_pve_list(pv_h);
5228 					pve_p->pve_next = new_pve_p;
5229 					new_pve_p = pve_p;
5230 					pve_p = temp_pve_p;
5231 					continue;
5232 				} else {
5233 					pvet_p = pve_p;
5234 					pvh_cnt++;
5235 				}
5236 			}
5237 
5238 			pve_pp = pve_next_ptr(pve_p);
5239 			pve_p = pve_next(pve_p);
5240 			iommu_mapping_in_pve = false;
5241 		}
5242 	}
5243 
5244 protect_finish:
5245 
5246 #ifdef PVH_FLAG_EXEC
5247 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5248 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5249 	}
5250 #endif
5251 	if (__improbable(pass1_updated != pass2_updated)) {
5252 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5253 		    __func__, pass1_updated, pass2_updated);
5254 	}
5255 	/* if we removed a bunch of entries, take care of them now */
5256 	if (remove) {
5257 		if (new_pve_p != PV_ENTRY_NULL) {
5258 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5259 			pvh_set_flags(pv_h, pvh_flags);
5260 		} else if (new_pte_p != PT_ENTRY_NULL) {
5261 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5262 			pvh_set_flags(pv_h, pvh_flags);
5263 		} else {
5264 			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5265 				pmap_flush_noncoherent_page(phys);
5266 			}
5267 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5268 		}
5269 	}
5270 
5271 	if (flush_range && tlb_flush_needed) {
5272 		if (!remove) {
5273 			flush_range->ptfr_flush_needed = true;
5274 			tlb_flush_needed = false;
5275 		}
5276 	}
5277 
5278 	/*
5279 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5280 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5281 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5282 	 * a page to be repurposed while it is still live in the TLBs.
5283 	 */
5284 	if (remove && tlb_flush_needed) {
5285 		sync_tlb_flush();
5286 	}
5287 
5288 
5289 	pvh_unlock(pai);
5290 
5291 	if (remove) {
5292 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5293 #if !XNU_MONITOR
5294 		mp_enable_preemption();
5295 #endif
5296 	}
5297 
5298 	if (!remove && tlb_flush_needed) {
5299 		sync_tlb_flush();
5300 	}
5301 
5302 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5303 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5304 	}
5305 }
5306 
5307 MARK_AS_PMAP_TEXT void
5308 pmap_page_protect_options_internal(
5309 	ppnum_t ppnum,
5310 	vm_prot_t prot,
5311 	unsigned int options,
5312 	void *arg)
5313 {
5314 	if (arg != NULL) {
5315 		/*
5316 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5317 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5318 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5319 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5320 		 * In that case, force the flush to take place.
5321 		 */
5322 		options &= ~PMAP_OPTIONS_NOFLUSH;
5323 	}
5324 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5325 }
5326 
5327 void
5328 pmap_page_protect_options(
5329 	ppnum_t ppnum,
5330 	vm_prot_t prot,
5331 	unsigned int options,
5332 	void *arg)
5333 {
5334 	pmap_paddr_t    phys = ptoa(ppnum);
5335 
5336 	assert(ppnum != vm_page_fictitious_addr);
5337 
5338 	/* Only work with managed pages. */
5339 	if (!pa_valid(phys)) {
5340 		return;
5341 	}
5342 
5343 	/*
5344 	 * Determine the new protection.
5345 	 */
5346 	if (prot == VM_PROT_ALL) {
5347 		return;         /* nothing to do */
5348 	}
5349 
5350 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5351 
5352 #if XNU_MONITOR
5353 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5354 #else
5355 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5356 #endif
5357 
5358 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5359 }
5360 
5361 
5362 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5363 MARK_AS_PMAP_TEXT void
5364 pmap_disable_user_jop_internal(pmap_t pmap)
5365 {
5366 	if (pmap == kernel_pmap) {
5367 		panic("%s: called with kernel_pmap", __func__);
5368 	}
5369 	validate_pmap_mutable(pmap);
5370 	pmap->disable_jop = true;
5371 }
5372 
5373 void
5374 pmap_disable_user_jop(pmap_t pmap)
5375 {
5376 #if XNU_MONITOR
5377 	pmap_disable_user_jop_ppl(pmap);
5378 #else
5379 	pmap_disable_user_jop_internal(pmap);
5380 #endif
5381 }
5382 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5383 
5384 /*
5385  * Indicates if the pmap layer enforces some additional restrictions on the
5386  * given set of protections.
5387  */
5388 bool
5389 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5390 {
5391 	return false;
5392 }
5393 
5394 /*
5395  *	Set the physical protection on the
5396  *	specified range of this map as requested.
5397  *	VERY IMPORTANT: Will not increase permissions.
5398  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5399  */
5400 void
5401 pmap_protect(
5402 	pmap_t pmap,
5403 	vm_map_address_t b,
5404 	vm_map_address_t e,
5405 	vm_prot_t prot)
5406 {
5407 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5408 }
5409 
/**
 * Helper for pmap_protect_options(): downgrade the protection of every valid
 * leaf mapping in [start, end), a range which must lie entirely within the
 * single twig-level (last-level) page table containing 'start'.
 *
 * @param pmap pmap whose mappings are updated (validated before use).
 * @param start page-aligned virtual address of the first mapping to update.
 * @param end address one past the range; must not cross the twig boundary.
 * @param prot new protection to apply; must not be a full-removal request
 *        (those are routed to pmap_remove_options() by the caller).
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_PROTECT_IMMEDIATE is
 *        only honored on DEVELOPMENT || DEBUG kernels.
 * @param args unused.
 *
 * @return the first virtual address NOT processed: 'end' on completion, or
 *         an earlier address if the scan broke out to service pending
 *         preemption, in which case the caller resumes from that address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The caller must confine the range to a single twig-level table. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Revoking all access is a removal; this routine must not be used for it. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

	/* Execute permission is honored; on DEBUG kernels NX may also be globally disabled. */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* Nothing to do unless the twig entry points at a leaf page table. */
	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/* Bail out periodically if preemption is pending; caller resumes at 'va'. */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Empty and compressed entries carry no hardware permissions to change. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock so the PTE and the page's
			 * attributes are updated consistently.  The first PTE read was
			 * racy, so re-read it under the lock and retry until the lock we
			 * hold matches the page the PTE actually refers to.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* The entry may have been concurrently invalidated or compressed; re-check. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the new PTE, starting with the AP (access permission) bits. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			/* XS-attribute mappings require the stronger TLBI variant on flush. */
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish all PTE updates, then flush the covered VA range as one batch. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5657 
5658 void
5659 pmap_protect_options(
5660 	pmap_t pmap,
5661 	vm_map_address_t b,
5662 	vm_map_address_t e,
5663 	vm_prot_t prot,
5664 	unsigned int options,
5665 	__unused void *args)
5666 {
5667 	vm_map_address_t l, beg;
5668 
5669 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5670 
5671 	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5672 		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5673 		    pmap, (uint64_t)b, (uint64_t)e);
5674 	}
5675 
5676 	/*
5677 	 * We allow single-page requests to execute non-preemptibly,
5678 	 * as it doesn't make sense to sample AST_URGENT for a single-page
5679 	 * operation, and there are a couple of special use cases that
5680 	 * require a non-preemptible single-page operation.
5681 	 */
5682 	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5683 		pmap_verify_preemptible();
5684 	}
5685 
5686 #if DEVELOPMENT || DEBUG
5687 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5688 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5689 			pmap_remove_options(pmap, b, e, options);
5690 			return;
5691 		}
5692 	} else
5693 #endif
5694 	{
5695 		/* Determine the new protection. */
5696 		switch (prot) {
5697 		case VM_PROT_EXECUTE:
5698 		case VM_PROT_READ:
5699 		case VM_PROT_READ | VM_PROT_EXECUTE:
5700 			break;
5701 		case VM_PROT_READ | VM_PROT_WRITE:
5702 		case VM_PROT_ALL:
5703 			return;         /* nothing to do */
5704 		default:
5705 			pmap_remove_options(pmap, b, e, options);
5706 			return;
5707 		}
5708 	}
5709 
5710 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5711 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5712 	    VM_KERNEL_ADDRHIDE(e));
5713 
5714 	beg = b;
5715 
5716 	while (beg < e) {
5717 		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5718 
5719 		if (l > e) {
5720 			l = e;
5721 		}
5722 
5723 #if XNU_MONITOR
5724 		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5725 #else
5726 		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5727 #endif
5728 	}
5729 
5730 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5731 }
5732 
5733 /**
5734  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5735  *
5736  * @param pmap pmap to insert the pages into.
5737  * @param va virtual address to map the pages into.
5738  * @param pa page number of the first physical page to map.
5739  * @param size block size, in number of pages.
5740  * @param prot mapping protection attributes.
5741  * @param attr flags to pass to pmap_enter().
5742  *
5743  * @return KERN_SUCCESS.
5744  */
5745 kern_return_t
5746 pmap_map_block(
5747 	pmap_t pmap,
5748 	addr64_t va,
5749 	ppnum_t pa,
5750 	uint32_t size,
5751 	vm_prot_t prot,
5752 	int attr,
5753 	unsigned int flags)
5754 {
5755 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5756 }
5757 
5758 /**
5759  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5760  * As opposed to pmap_map_block(), this function takes
5761  * a physical address as an input and operates using the
5762  * page size associated with the input pmap.
5763  *
5764  * @param pmap pmap to insert the pages into.
5765  * @param va virtual address to map the pages into.
5766  * @param pa physical address of the first physical page to map.
5767  * @param size block size, in number of pages.
5768  * @param prot mapping protection attributes.
5769  * @param attr flags to pass to pmap_enter().
5770  *
5771  * @return KERN_SUCCESS.
5772  */
5773 kern_return_t
5774 pmap_map_block_addr(
5775 	pmap_t pmap,
5776 	addr64_t va,
5777 	pmap_paddr_t pa,
5778 	uint32_t size,
5779 	vm_prot_t prot,
5780 	int attr,
5781 	unsigned int flags)
5782 {
5783 #if __ARM_MIXED_PAGE_SIZE__
5784 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5785 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5786 #else
5787 	const uint64_t pmap_page_size = PAGE_SIZE;
5788 #endif
5789 
5790 	for (ppnum_t page = 0; page < size; page++) {
5791 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5792 			panic("%s: failed pmap_enter_addr, "
5793 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5794 			    __FUNCTION__,
5795 			    pmap, va, (uint64_t)pa, size, prot, flags);
5796 		}
5797 
5798 		va += pmap_page_size;
5799 		pa += pmap_page_size;
5800 	}
5801 
5802 	return KERN_SUCCESS;
5803 }
5804 
5805 kern_return_t
5806 pmap_enter_addr(
5807 	pmap_t pmap,
5808 	vm_map_address_t v,
5809 	pmap_paddr_t pa,
5810 	vm_prot_t prot,
5811 	vm_prot_t fault_type,
5812 	unsigned int flags,
5813 	boolean_t wired)
5814 {
5815 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5816 }
5817 
5818 /*
5819  *	Insert the given physical page (p) at
5820  *	the specified virtual address (v) in the
5821  *	target physical map with the protection requested.
5822  *
5823  *	If specified, the page will be wired down, meaning
5824  *	that the related pte can not be reclaimed.
5825  *
5826  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5827  *	or lose information.  That is, this routine must actually
5828  *	insert this page into the given map eventually (must make
5829  *	forward progress eventually.
5830  */
5831 kern_return_t
5832 pmap_enter(
5833 	pmap_t pmap,
5834 	vm_map_address_t v,
5835 	ppnum_t pn,
5836 	vm_prot_t prot,
5837 	vm_prot_t fault_type,
5838 	unsigned int flags,
5839 	boolean_t wired,
5840 	__unused pmap_mapping_type_t mapping_type)
5841 {
5842 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5843 }
5844 
5845 /*
5846  * Attempt to commit the pte.
5847  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5848  * Performs no page table or accounting writes on failures.
5849  */
5850 static inline bool
5851 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5852 {
5853 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5854 	bool success = false, changed_wiring = false;
5855 
5856 	__unreachable_ok_push
5857 	if (TEST_PAGE_RATIO_4) {
5858 		/*
5859 		 * 16K virtual pages w/ 4K hw pages.
5860 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5861 		 * As a result we require the exclusive pmap lock.
5862 		 */
5863 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5864 		*old_pte = *pte_p;
5865 		if (*old_pte == new_pte) {
5866 			/* Another thread completed this operation. Nothing to do here. */
5867 			success = true;
5868 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5869 		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5870 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5871 			success = false;
5872 		} else {
5873 			write_pte_fast(pte_p, new_pte);
5874 			success = true;
5875 		}
5876 	} else {
5877 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5878 	}
5879 	__unreachable_ok_pop
5880 
5881 	if (success && *old_pte != new_pte) {
5882 		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5883 			bool need_strong_sync = false;
5884 			FLUSH_PTE_STRONG();
5885 #if HAS_FEAT_XS
5886 			if (pte_is_xs(pt_attr, *old_pte)) {
5887 				need_strong_sync = true;
5888 			}
5889 #endif /* HAS_FEAT_XS */
5890 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5891 		} else {
5892 			FLUSH_PTE();
5893 			__builtin_arm_isb(ISB_SY);
5894 		}
5895 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5896 		    (new_pte & ARM_PTE_WIRED) != 0 :
5897 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5898 
5899 		if (pmap != kernel_pmap && changed_wiring) {
5900 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5901 			if (new_pte & ARM_PTE_WIRED) {
5902 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5903 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5904 			} else {
5905 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5906 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5907 			}
5908 		}
5909 
5910 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5911 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5912 	}
5913 	return success;
5914 }
5915 
/**
 * Translate VM_WIMG_* cacheability/ordering attributes into the memory
 * attribute index, shareability, and execute-never bits of a template PTE.
 *
 * @param wimg VM_WIMG_* value; only the bits under VM_WIMG_MASK are consulted.
 * @param pa physical address the mapping will target, used to distinguish
 *        DRAM from device/IO space when selecting posted attribute indices.
 *
 * @return a PTE template containing only attribute/shareability/XN bits; the
 *         caller merges in the output address and access-permission bits.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			// IO regions flagged as needing strong sync get the XS attribute index.
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		// Device-type mappings are never executable, at any EL.
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Default: normal cacheable memory; executability is left to the caller.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5993 
5994 
5995 /*
5996  * Construct a PTE (and the physical page attributes) for the given virtual to
5997  * physical mapping.
5998  *
5999  * This function has no side effects and is safe to call so that it is safe to
6000  * call while attempting a pmap_enter transaction.
6001  */
6002 MARK_AS_PMAP_TEXT static pt_entry_t
6003 pmap_construct_pte(
6004 	const pmap_t pmap,
6005 	vm_map_address_t va,
6006 	pmap_paddr_t pa,
6007 	vm_prot_t prot,
6008 	vm_prot_t fault_type,
6009 	boolean_t wired,
6010 	const pt_attr_t* const pt_attr,
6011 	uint16_t *pp_attr_bits /* OUTPUT */
6012 	)
6013 {
6014 	bool set_NX = false, set_XO = false;
6015 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
6016 	assert(pp_attr_bits != NULL);
6017 	*pp_attr_bits = 0;
6018 
6019 	if (wired) {
6020 		pte |= ARM_PTE_WIRED;
6021 	}
6022 
6023 #if DEVELOPMENT || DEBUG
6024 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
6025 #else
6026 	if ((prot & VM_PROT_EXECUTE))
6027 #endif
6028 	{
6029 		set_NX = false;
6030 	} else {
6031 		set_NX = true;
6032 	}
6033 
6034 	if (prot == VM_PROT_EXECUTE) {
6035 		set_XO = true;
6036 	}
6037 
6038 	if (set_NX) {
6039 		pte |= pt_attr_leaf_xn(pt_attr);
6040 	} else {
6041 		if (pmap == kernel_pmap) {
6042 			pte |= ARM_PTE_NX;
6043 		} else {
6044 			pte |= pt_attr_leaf_x(pt_attr);
6045 		}
6046 	}
6047 
6048 	if (pmap == kernel_pmap) {
6049 #if __ARM_KERNEL_PROTECT__
6050 		pte |= ARM_PTE_NG;
6051 #endif /* __ARM_KERNEL_PROTECT__ */
6052 		if (prot & VM_PROT_WRITE) {
6053 			pte |= ARM_PTE_AP(AP_RWNA);
6054 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6055 		} else {
6056 			pte |= ARM_PTE_AP(AP_RONA);
6057 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6058 		}
6059 	} else {
6060 		if (pmap->type != PMAP_TYPE_NESTED) {
6061 			pte |= ARM_PTE_NG;
6062 		} else if ((pmap->nested_region_unnested_table_bitmap)
6063 		    && (va >= pmap->nested_region_addr)
6064 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
6065 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
6066 
6067 			if ((pmap->nested_region_unnested_table_bitmap)
6068 			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
6069 				pte |= ARM_PTE_NG;
6070 			}
6071 		}
6072 		if (prot & VM_PROT_WRITE) {
6073 			assert(pmap->type != PMAP_TYPE_NESTED);
6074 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6075 				if (fault_type & VM_PROT_WRITE) {
6076 					pte |= pt_attr_leaf_rw(pt_attr);
6077 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6078 				} else {
6079 					pte |= pt_attr_leaf_ro(pt_attr);
6080 					/*
6081 					 * Mark the page as MODFAULT so that a subsequent write
6082 					 * may be handled through arm_fast_fault().
6083 					 */
6084 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6085 					pte_set_was_writeable(pte, true);
6086 				}
6087 			} else {
6088 				pte |= pt_attr_leaf_rw(pt_attr);
6089 				*pp_attr_bits |= PP_ATTR_REFERENCED;
6090 			}
6091 		} else {
6092 			if (set_XO) {
6093 				pte |= pt_attr_leaf_rona(pt_attr);
6094 			} else {
6095 				pte |= pt_attr_leaf_ro(pt_attr);
6096 			}
6097 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6098 		}
6099 	}
6100 
6101 	pte |= ARM_PTE_AF;
6102 	return pte;
6103 }
6104 
6105 MARK_AS_PMAP_TEXT kern_return_t
6106 pmap_enter_options_internal(
6107 	pmap_t pmap,
6108 	vm_map_address_t v,
6109 	pmap_paddr_t pa,
6110 	vm_prot_t prot,
6111 	vm_prot_t fault_type,
6112 	unsigned int flags,
6113 	boolean_t wired,
6114 	unsigned int options)
6115 {
6116 	ppnum_t         pn = (ppnum_t)atop(pa);
6117 	pt_entry_t      pte;
6118 	pt_entry_t      spte;
6119 	pt_entry_t      *pte_p;
6120 	bool            refcnt_updated;
6121 	bool            wiredcnt_updated;
6122 	bool            ro_va = false;
6123 	unsigned int    wimg_bits;
6124 	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
6125 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
6126 	kern_return_t   kr = KERN_SUCCESS;
6127 	uint16_t pp_attr_bits;
6128 	volatile uint16_t *refcnt;
6129 	volatile uint16_t *wiredcnt;
6130 	pv_free_list_t *local_pv_free;
6131 
6132 	validate_pmap_mutable(pmap);
6133 
6134 #if XNU_MONITOR
6135 	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
6136 		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
6137 	}
6138 #endif
6139 
6140 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6141 
6142 	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
6143 		panic("%s: pmap %p v 0x%llx not page-aligned",
6144 		    __func__, pmap, (unsigned long long)v);
6145 	}
6146 
6147 	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
6148 		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
6149 	}
6150 
6151 	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
6152 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6153 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6154 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6155 	}
6156 
6157 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6158 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6159 		    pmap, (uint64_t)pa);
6160 	}
6161 
6162 	/* The PA should not extend beyond the architected physical address space */
6163 	pa &= ARM_PTE_PAGE_MASK;
6164 
6165 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
6166 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
6167 		extern vm_offset_t ctrr_test_page;
6168 		if (__probable(v != ctrr_test_page))
6169 #endif
6170 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6171 	}
6172 	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
6173 		if (__improbable(prot != VM_PROT_READ)) {
6174 			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
6175 			    __func__, (unsigned long long)v, prot);
6176 		}
6177 		ro_va = true;
6178 	}
6179 	assert(pn != vm_page_fictitious_addr);
6180 
6181 	refcnt_updated = false;
6182 	wiredcnt_updated = false;
6183 
6184 	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
6185 		/*
6186 		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
6187 		 *
6188 		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
6189 		 */
6190 		lock_mode = PMAP_LOCK_EXCLUSIVE;
6191 	}
6192 
6193 	if (!pmap_lock_preempt(pmap, lock_mode)) {
6194 		return KERN_ABORTED;
6195 	}
6196 
6197 	/*
6198 	 *	Expand pmap to include this pte.  Assume that
6199 	 *	pmap is always expanded to include enough hardware
6200 	 *	pages to map one VM page.
6201 	 */
6202 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6203 		/* Must unlock to expand the pmap. */
6204 		pmap_unlock(pmap, lock_mode);
6205 
6206 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6207 
6208 		if (kr != KERN_SUCCESS) {
6209 			return kr;
6210 		}
6211 
6212 		if (!pmap_lock_preempt(pmap, lock_mode)) {
6213 			return KERN_ABORTED;
6214 		}
6215 	}
6216 
6217 	if (options & PMAP_OPTIONS_NOENTER) {
6218 		pmap_unlock(pmap, lock_mode);
6219 		return KERN_SUCCESS;
6220 	}
6221 
6222 	/*
6223 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6224 	 * done via a cmpxchg loop.
6225 	 * We need to be careful about modifying non-local data structures before commiting
6226 	 * the new pte since we may need to re-do the transaction.
6227 	 */
6228 	spte = os_atomic_load(pte_p, relaxed);
6229 	while (!committed) {
6230 		refcnt = NULL;
6231 		wiredcnt = NULL;
6232 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6233 		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6234 
6235 		if (pmap != kernel_pmap) {
6236 			ptd_info_t *ptd_info = ptep_get_info(pte_p);
6237 			refcnt = &ptd_info->refcnt;
6238 			wiredcnt = &ptd_info->wiredcnt;
6239 			/*
6240 			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
6241 			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
6242 			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
6243 			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
6244 			 * have PTDs, so we can't use the check there.
6245 			 */
6246 			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
6247 				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
6248 				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
6249 			}
6250 			/*
6251 			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
6252 			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
6253 			 * or acquire the pmap lock exclusive.
6254 			 */
6255 			if (!wiredcnt_updated) {
6256 				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
6257 				wiredcnt_updated = true;
6258 			}
6259 			if (!refcnt_updated) {
6260 				OSAddAtomic16(1, (volatile int16_t*)refcnt);
6261 				refcnt_updated = true;
6262 				drop_refcnt = true;
6263 			}
6264 		}
6265 
6266 		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
6267 			/*
6268 			 * There is already a mapping here & it's for a different physical page.
6269 			 * First remove that mapping.
6270 			 *
6271 			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
6272 			 */
6273 			if (lock_mode == PMAP_LOCK_SHARED) {
6274 				if (pmap_lock_shared_to_exclusive(pmap)) {
6275 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6276 				} else {
6277 					/*
6278 					 * We failed to upgrade to an exclusive lock.
6279 					 * As a result we no longer hold the lock at all,
6280 					 * so we need to re-acquire it and restart the transaction.
6281 					 */
6282 					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6283 					lock_mode = PMAP_LOCK_EXCLUSIVE;
6284 					/* pmap might have changed after we dropped the lock. Try again. */
6285 					spte = os_atomic_load(pte_p, relaxed);
6286 					continue;
6287 				}
6288 			}
6289 			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
6290 			spte = ARM_PTE_TYPE_FAULT;
6291 			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
6292 		}
6293 
6294 		/*
6295 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
6296 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
6297 		 * read-write protection. The PMAP layer though still needs to use the right
6298 		 * index, which is the older XO-now-TPRO one and that is specially selected
6299 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
6300 		 */
6301 		if (options & PMAP_OPTIONS_MAP_TPRO) {
6302 			if (__improbable(pmap == kernel_pmap)) {
6303 				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
6304 				    __func__);
6305 			}
6306 			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
6307 		} else {
6308 			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
6309 		}
6310 
6311 
6312 		if (pa_valid(pa)) {
6313 			unsigned int pai;
6314 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6315 
6316 			is_internal = FALSE;
6317 			is_altacct = FALSE;
6318 
6319 			pai = pa_index(pa);
6320 
6321 			pvh_lock(pai);
6322 
6323 			/*
6324 			 * Make sure that the current per-cpu PV free list has
6325 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
6326 			 * if the transaction succeeds. We're either in the
6327 			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
6328 			 * Note that we can still be interrupted, but a primary
6329 			 * interrupt handler can never enter the pmap.
6330 			 */
6331 #if !XNU_MONITOR
6332 			assert(get_preemption_level() > 0);
6333 #endif
6334 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6335 			pv_entry_t **pv_h = pai_to_pvh(pai);
6336 			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
6337 			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);
6338 
6339 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6340 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6341 				int new_allocated_pves = 0;
6342 
6343 				while (new_allocated_pves < 2) {
6344 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6345 					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
6346 					if (pv_status == PV_ALLOC_FAIL) {
6347 						break;
6348 					} else if (pv_status == PV_ALLOC_RETRY) {
6349 						/*
6350 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6351 						 * it will have dropped the pmap lock while doing so.
6352 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6353 						 * be on a different CPU now.
6354 						 */
6355 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6356 					} else {
6357 						/* If we've gotten this far then a node should've been allocated. */
6358 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6359 
6360 						new_allocated_pves++;
6361 					}
6362 				}
6363 
6364 				for (int i = 0; i < new_allocated_pves; i++) {
6365 					pv_free(new_pve_p[i]);
6366 				}
6367 			}
6368 
6369 			if (pv_status == PV_ALLOC_FAIL) {
6370 				pvh_unlock(pai);
6371 				kr = KERN_RESOURCE_SHORTAGE;
6372 				break;
6373 			} else if (pv_status == PV_ALLOC_RETRY) {
6374 				pvh_unlock(pai);
6375 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6376 				spte = os_atomic_load(pte_p, relaxed);
6377 				continue;
6378 			}
6379 
6380 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6381 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6382 			} else {
6383 				wimg_bits = pmap_cache_attributes(pn);
6384 			}
6385 
6386 			/* We may be retrying this operation after dropping the PVH lock.
6387 			 * Cache attributes for the physical page may have changed while the lock
6388 			 * was dropped, so clear any cache attributes we may have previously set
6389 			 * in the PTE template. */
6390 			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
6391 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6392 
6393 
6394 
6395 #if XNU_MONITOR
6396 			/* The regular old kernel is not allowed to remap PPL pages. */
6397 			if (__improbable(ppattr_pa_test_monitor(pa))) {
6398 				panic("%s: page belongs to PPL, "
6399 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6400 				    __FUNCTION__,
6401 				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
6402 			}
6403 
6404 			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
6405 				panic("%s: page locked down, "
6406 				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
6407 				    __FUNCTION__,
6408 				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
6409 			}
6410 #endif
6411 
6412 
6413 
6414 
6415 
6416 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6417 			if (!committed) {
6418 				pvh_unlock(pai);
6419 				continue;
6420 			}
6421 			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6422 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6423 
6424 			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6425 			/*
6426 			 * If there was already a valid pte here then we reuse its reference
6427 			 * on the ptd and drop the one that we took above.
6428 			 */
6429 			drop_refcnt = had_valid_mapping;
6430 
6431 			if (!had_valid_mapping) {
6432 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6433 				int pve_ptep_idx = 0;
6434 				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
6435 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6436 				if (pv_status != PV_ALLOC_SUCCESS) {
6437 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6438 					    __func__, pv_status, new_pve_p, pmap);
6439 				}
6440 
6441 				if (pmap != kernel_pmap) {
6442 					if (options & PMAP_OPTIONS_INTERNAL) {
6443 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6444 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6445 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6446 							/*
6447 							 * Make a note to ourselves that this
6448 							 * mapping is using alternative
6449 							 * accounting. We'll need this in order
6450 							 * to know which ledger to debit when
6451 							 * the mapping is removed.
6452 							 *
6453 							 * The altacct bit must be set while
6454 							 * the pv head is locked. Defer the
6455 							 * ledger accounting until after we've
6456 							 * dropped the lock.
6457 							 */
6458 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6459 							is_altacct = TRUE;
6460 						}
6461 					}
6462 					if (ppattr_test_reusable(pai) &&
6463 					    !is_altacct) {
6464 						is_reusable = TRUE;
6465 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6466 						is_internal = TRUE;
6467 					} else {
6468 						is_external = TRUE;
6469 					}
6470 				}
6471 			}
6472 
6473 			pvh_unlock(pai);
6474 
6475 			if (pp_attr_bits != 0) {
6476 				ppattr_pa_set_bits(pa, pp_attr_bits);
6477 			}
6478 
6479 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6480 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6481 
6482 				if (is_internal) {
6483 					/*
6484 					 * Make corresponding adjustments to
6485 					 * phys_footprint statistics.
6486 					 */
6487 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6488 					if (is_altacct) {
6489 						/*
6490 						 * If this page is internal and
6491 						 * in an IOKit region, credit
6492 						 * the task's total count of
6493 						 * dirty, internal IOKit pages.
6494 						 * It should *not* count towards
6495 						 * the task's total physical
6496 						 * memory footprint, because
6497 						 * this entire region was
6498 						 * already billed to the task
6499 						 * at the time the mapping was
6500 						 * created.
6501 						 *
6502 						 * Put another way, this is
6503 						 * internal++ and
6504 						 * alternate_accounting++, so
6505 						 * net effect on phys_footprint
6506 						 * is 0. That means: don't
6507 						 * touch phys_footprint here.
6508 						 */
6509 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6510 					} else {
6511 						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6512 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6513 							skip_footprint_debit = true;
6514 						} else {
6515 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6516 						}
6517 					}
6518 				}
6519 				if (is_reusable) {
6520 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6521 				} else if (is_external) {
6522 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6523 				}
6524 			}
6525 		} else {
6526 			if (prot & VM_PROT_EXECUTE) {
6527 				kr = KERN_FAILURE;
6528 				break;
6529 			}
6530 
6531 			wimg_bits = pmap_cache_attributes(pn);
6532 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6533 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6534 			}
6535 
6536 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6537 
6538 #if XNU_MONITOR
6539 			pte = pmap_construct_io_pte(pa, pte);
6540 
6541 			/**
6542 			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
6543 			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
6544 			 * created and later removed.  We must therefore prevent an attacker from downgrading a
6545 			 * a writable mapping in order to allow it to be removed and remapped to something else.
6546 			 */
6547 			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
6548 			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
6549 			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
6550 			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
6551 				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
6552 				    __func__, (uint64_t)pte_to_pa(spte));
6553 			}
6554 #endif
6555 
6556 			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
6557 			if (committed) {
6558 				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
6559 				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
6560 
6561 				/**
6562 				 * If there was already a valid pte here then we reuse its
6563 				 * reference on the ptd and drop the one that we took above.
6564 				 */
6565 				drop_refcnt = had_valid_mapping;
6566 			}
6567 		}
6568 		if (committed) {
6569 			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
6570 				assert(pmap != kernel_pmap);
6571 
6572 				/* One less "compressed" */
6573 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6574 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6575 
6576 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6577 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6578 				} else if (!skip_footprint_debit) {
6579 					/* Was part of the footprint */
6580 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6581 				}
6582 				/* The old entry held a reference so drop the extra one that we took above. */
6583 				drop_refcnt = true;
6584 			}
6585 		}
6586 	}
6587 
6588 	if (drop_refcnt && refcnt != NULL) {
6589 		assert(refcnt_updated);
6590 		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
6591 			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6592 		}
6593 	}
6594 
6595 	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
6596 		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
6597 	}
6598 
6599 	pmap_unlock(pmap, lock_mode);
6600 
6601 	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
6602 		pmap_phys_write_disable(v);
6603 	}
6604 
6605 	return kr;
6606 }
6607 
6608 kern_return_t
6609 pmap_enter_options_addr(
6610 	pmap_t pmap,
6611 	vm_map_address_t v,
6612 	pmap_paddr_t pa,
6613 	vm_prot_t prot,
6614 	vm_prot_t fault_type,
6615 	unsigned int flags,
6616 	boolean_t wired,
6617 	unsigned int options,
6618 	__unused void   *arg,
6619 	__unused pmap_mapping_type_t mapping_type)
6620 {
6621 	kern_return_t kr = KERN_FAILURE;
6622 
6623 
6624 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6625 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6626 
6627 
6628 	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
6629 	do {
6630 #if XNU_MONITOR
6631 		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
6632 #else
6633 		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6634 #endif
6635 
6636 		if (kr == KERN_RESOURCE_SHORTAGE) {
6637 #if XNU_MONITOR
6638 			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6639 #endif
6640 			if (nowait_requested) {
6641 				break;
6642 			}
6643 		}
6644 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
6645 
6646 #if XNU_MONITOR
6647 	pmap_ledger_check_balance(pmap);
6648 #endif
6649 
6650 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6651 
6652 	return kr;
6653 }
6654 
6655 kern_return_t
6656 pmap_enter_options(
6657 	pmap_t pmap,
6658 	vm_map_address_t v,
6659 	ppnum_t pn,
6660 	vm_prot_t prot,
6661 	vm_prot_t fault_type,
6662 	unsigned int flags,
6663 	boolean_t wired,
6664 	unsigned int options,
6665 	__unused void   *arg,
6666 	pmap_mapping_type_t mapping_type)
6667 {
6668 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6669 }
6670 
6671 /*
6672  *	Routine:	pmap_change_wiring
6673  *	Function:	Change the wiring attribute for a map/virtual-address
6674  *			pair.
6675  *	In/out conditions:
6676  *			The mapping must already exist in the pmap.
6677  */
6678 MARK_AS_PMAP_TEXT kern_return_t
6679 pmap_change_wiring_internal(
6680 	pmap_t pmap,
6681 	vm_map_address_t v,
6682 	boolean_t wired)
6683 {
6684 	pt_entry_t     *pte_p;
6685 	pmap_paddr_t    pa;
6686 
6687 	validate_pmap_mutable(pmap);
6688 
6689 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6690 		return KERN_ABORTED;
6691 	}
6692 
6693 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6694 
6695 	pte_p = pmap_pte(pmap, v);
6696 	if (pte_p == PT_ENTRY_NULL) {
6697 		if (!wired) {
6698 			/*
6699 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6700 			 * may have been freed by a remove operation.
6701 			 */
6702 			goto pmap_change_wiring_return;
6703 		} else {
6704 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6705 		}
6706 	}
6707 	/*
6708 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6709 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6710 	 */
6711 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6712 
6713 	while (pa_valid(pa)) {
6714 		pmap_paddr_t new_pa;
6715 
6716 		pvh_lock(pa_index(pa));
6717 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6718 
6719 		if (pa == new_pa) {
6720 			break;
6721 		}
6722 
6723 		pvh_unlock(pa_index(pa));
6724 		pa = new_pa;
6725 	}
6726 
6727 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6728 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6729 		if (!wired) {
6730 			/* PTE cleared by prior remove/disconnect operation */
6731 			goto pmap_change_wiring_cleanup;
6732 		} else {
6733 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6734 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6735 		}
6736 	}
6737 
6738 	assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6739 	if (wired != pte_is_wired(*pte_p)) {
6740 		pte_set_wired(pmap, pte_p, wired);
6741 		if (pmap != kernel_pmap) {
6742 			if (wired) {
6743 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6744 			} else if (!wired) {
6745 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6746 			}
6747 		}
6748 	}
6749 
6750 pmap_change_wiring_cleanup:
6751 	if (pa_valid(pa)) {
6752 		pvh_unlock(pa_index(pa));
6753 	}
6754 
6755 pmap_change_wiring_return:
6756 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6757 
6758 	return KERN_SUCCESS;
6759 }
6760 
6761 void
6762 pmap_change_wiring(
6763 	pmap_t pmap,
6764 	vm_map_address_t v,
6765 	boolean_t wired)
6766 {
6767 	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
6768 	pmap_verify_preemptible();
6769 
6770 	kern_return_t kr = KERN_FAILURE;
6771 #if XNU_MONITOR
6772 	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6773 	do {
6774 		kr = pmap_change_wiring_ppl(pmap, v, wired);
6775 	} while (kr == KERN_ABORTED);
6776 
6777 	pmap_ledger_check_balance(pmap);
6778 #else
6779 	/* Since we verified preemptibility, call the helper only once. */
6780 	kr = pmap_change_wiring_internal(pmap, v, wired);
6781 #endif
6782 
6783 	if (kr != KERN_SUCCESS) {
6784 		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6785 		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6786 	}
6787 }
6788 
6789 MARK_AS_PMAP_TEXT pmap_paddr_t
6790 pmap_find_pa_internal(
6791 	pmap_t pmap,
6792 	addr64_t va)
6793 {
6794 	pmap_paddr_t    pa = 0;
6795 
6796 	validate_pmap(pmap);
6797 
6798 	if (pmap != kernel_pmap) {
6799 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6800 	}
6801 
6802 	pa = pmap_vtophys(pmap, va);
6803 
6804 	if (pmap != kernel_pmap) {
6805 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6806 	}
6807 
6808 	return pa;
6809 }
6810 
6811 pmap_paddr_t
6812 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6813 {
6814 	pmap_paddr_t pa = 0;
6815 
6816 	if (pmap == kernel_pmap) {
6817 		pa = mmu_kvtop(va);
6818 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6819 		/*
6820 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6821 		 * translation even if PAN would prevent kernel access through the translation.
6822 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6823 		 */
6824 		pa = mmu_uvtop(va);
6825 	}
6826 	return pa;
6827 }
6828 
6829 pmap_paddr_t
6830 pmap_find_pa(
6831 	pmap_t pmap,
6832 	addr64_t va)
6833 {
6834 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6835 
6836 	if (pa != 0) {
6837 		return pa;
6838 	}
6839 
6840 	if (not_in_kdp) {
6841 #if XNU_MONITOR
6842 		return pmap_find_pa_ppl(pmap, va);
6843 #else
6844 		return pmap_find_pa_internal(pmap, va);
6845 #endif
6846 	} else {
6847 		return pmap_vtophys(pmap, va);
6848 	}
6849 }
6850 
6851 ppnum_t
6852 pmap_find_phys_nofault(
6853 	pmap_t pmap,
6854 	addr64_t va)
6855 {
6856 	ppnum_t ppn;
6857 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6858 	return ppn;
6859 }
6860 
6861 ppnum_t
6862 pmap_find_phys(
6863 	pmap_t pmap,
6864 	addr64_t va)
6865 {
6866 	ppnum_t ppn;
6867 	ppn = atop(pmap_find_pa(pmap, va));
6868 	return ppn;
6869 }
6870 
6871 /**
6872  * Translate a kernel virtual address into a physical address.
6873  *
6874  * @param va The kernel virtual address to translate. Does not work on user
6875  *           virtual addresses.
6876  *
6877  * @return The physical address if the translation was successful, or zero if
6878  *         no valid mappings were found for the given virtual address.
6879  */
6880 pmap_paddr_t
6881 kvtophys(vm_offset_t va)
6882 {
6883 	/**
6884 	 * Attempt to do the translation first in hardware using the AT (address
6885 	 * translation) instruction. This will attempt to use the MMU to do the
6886 	 * translation for us.
6887 	 */
6888 	pmap_paddr_t pa = mmu_kvtop(va);
6889 
6890 	if (pa) {
6891 		return pa;
6892 	}
6893 
6894 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6895 	return pmap_vtophys(kernel_pmap, va);
6896 }
6897 
6898 /**
6899  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6900  * points to a non-kernel-managed physical page, then this call will panic().
6901  *
6902  * @note The output of this function is guaranteed to be a kernel-managed
6903  *       physical page, which means it's safe to pass the output directly to
6904  *       pa_index() to create a physical address index for various pmap data
6905  *       structures.
6906  *
6907  * @param va The kernel virtual address to translate. Does not work on user
6908  *           virtual addresses.
6909  *
6910  * @return The translated physical address for the given virtual address.
6911  */
6912 pmap_paddr_t
6913 kvtophys_nofail(vm_offset_t va)
6914 {
6915 	pmap_paddr_t pa = kvtophys(va);
6916 
6917 	if (!pa_valid(pa)) {
6918 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6919 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6920 	}
6921 
6922 	return pa;
6923 }
6924 
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* A VA outside the pmap's translatable range cannot have a mapping. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t   tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	/* Start the walk at the root translation table. */
	ttp = pmap->tte;

	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		/* Index into the current-level table for this VA. */
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		/* Per-level bit masks come from the pmap's page-table attributes. */
		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at any level terminates the walk with no translation. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the VA's offset bits for this level. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: descend to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6969 
6970 /*
6971  *	pmap_init_pte_page - Initialize a page table page.
6972  */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	/* Look up the PV head for the physical page backing this page table. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		/* No PTD is associated with this page yet. */
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* A PTD already exists for this page (e.g. from a bootstrap allocation); reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	/* Record which pmap/VA/level this page table serves. */
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
7012 
7013 /*
7014  *	Routine:	pmap_expand
7015  *
7016  *	Expands a pmap to be able to map the specified virtual address.
7017  *
7018  *	Allocates new memory for the default (COARSE) translation table
7019  *	entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
7020  *	also allocates space for the corresponding pv entries.
7021  *
7022  *	Nothing should be locked.
7023  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject VAs outside the pmap's translatable range. */
	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Walk from the root level down, ensuring each intermediate table exists. */
	for (; ttlevel < level; ttlevel++) {
		/* Lock attempt may abort (e.g. on pending preemption). */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Next-level table is missing; drop the lock to allocate one. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				/* Wait for a free page, then retry the allocation. */
				VM_PAGE_WAIT();
#endif
			}

			/* Re-acquire the lock (exclusive, since we may install a TTE). */
			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check: another thread may have installed the table while we allocated. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Link the new table into the current-level entry. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				/* Consumed: clear so the page isn't deallocated below. */
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If we lost the race, release the table page we allocated. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
7095 
7096 /*
7097  *	Routine:	pmap_gc
7098  *	Function:
7099  *              Pmap garbage collection
7100  *		Called by the pageout daemon when pages are scarce.
7101  *
7102  */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
7114 
7115 /*
7116  *      By default, don't attempt pmap GC more frequently
7117  *      than once / 1 minutes.
7118  */
7119 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: no pmap GC throttle is computed on this architecture. */
}
7125 
7126 /*
7127  * pmap_attribute_cache_sync(vm_offset_t pa)
7128  *
7129  * Invalidates all of the instruction cache on a physical page and
7130  * pushes any dirty data from the data cache for the same physical page
7131  */
7132 
7133 kern_return_t
7134 pmap_attribute_cache_sync(
7135 	ppnum_t pp,
7136 	vm_size_t size,
7137 	__unused vm_machine_attribute_t attribute,
7138 	__unused vm_machine_attribute_val_t * value)
7139 {
7140 	if (size > PAGE_SIZE) {
7141 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7142 	} else {
7143 		cache_sync_page(pp);
7144 	}
7145 
7146 	return KERN_SUCCESS;
7147 }
7148 
7149 /*
7150  * pmap_sync_page_data_phys(ppnum_t pp)
7151  *
7152  * Invalidates all of the instruction cache on a physical page and
7153  * pushes any dirty data from the data cache for the same physical page
7154  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Delegates the I-cache invalidate / D-cache clean to cache_sync_page(). */
	cache_sync_page(pp);
}
7161 
7162 /*
7163  * pmap_sync_page_attributes_phys(ppnum_t pp)
7164  *
7165  * Write back and invalidate all cachelines on a physical page.
7166  */
7167 void
7168 pmap_sync_page_attributes_phys(
7169 	ppnum_t pp)
7170 {
7171 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7172 }
7173 
7174 #if CONFIG_COREDUMP
7175 /* temporary workaround */
7176 boolean_t
7177 coredumpok(
7178 	vm_map_t map,
7179 	mach_vm_offset_t va)
7180 {
7181 	pt_entry_t     *pte_p;
7182 	pt_entry_t      spte;
7183 
7184 	pte_p = pmap_pte(map->pmap, va);
7185 	if (0 == pte_p) {
7186 		return FALSE;
7187 	}
7188 	if (vm_map_entry_has_device_pager(map, va)) {
7189 		return FALSE;
7190 	}
7191 	spte = *pte_p;
7192 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7193 }
7194 #endif
7195 
7196 void
7197 fillPage(
7198 	ppnum_t pn,
7199 	unsigned int fill)
7200 {
7201 	unsigned int   *addr;
7202 	int             count;
7203 
7204 	addr = (unsigned int *) phystokv(ptoa(pn));
7205 	count = PAGE_SIZE / sizeof(unsigned int);
7206 	while (count--) {
7207 		*addr++ = fill;
7208 	}
7209 }
7210 
7211 extern void     mapping_set_mod(ppnum_t pn);
7212 
void
mapping_set_mod(
	ppnum_t pn)
{
	/* VM-layer alias: mark the physical page "modified". */
	pmap_set_modify(pn);
}
7219 
7220 extern void     mapping_set_ref(ppnum_t pn);
7221 
void
mapping_set_ref(
	ppnum_t pn)
{
	/* VM-layer alias: mark the physical page "referenced". */
	pmap_set_reference(pn);
}
7228 
7229 /*
7230  * Clear specified attribute bits.
7231  *
7232  * Try to force an arm_fast_fault() for all mappings of
7233  * the page - to force attributes to be set again at fault time.
7234  * If the forcing succeeds, clear the cached bits at the head.
7235  * Otherwise, something must have been wired, so leave the cached
7236  * attributes alone.
7237  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A non-NULL arg or flush_range means TLB maintenance will happen; drop NOFLUSH. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Compute which access modes must fault so the bits can be re-derived. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping could be forced to fault. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7311 
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/* Single-page variant: no batched flush range is used. */
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7321 
7322 #if __ARM_RANGE_TLBI__
/*
 * Clear the given pp_attr bits for every managed page mapped in [start, end)
 * of pmap, where the range is contained within a single twig-level table.
 * Returns the VA just past the last entry processed; a return value < end
 * indicates the walk stopped early because preemption was pending.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig-level entry: nothing is mapped here, so the range is "done". */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* After at least one page, bail out early if preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7366 
/*
 * Clear the given pp_attr bits across [start, end) in pmap, one twig-level
 * table at a time, coalescing TLB maintenance into a single ranged flush.
 * Returns the VA up to which the operation completed; a value < end means
 * the caller should retry from that point (lock not acquired, or preemption
 * pending).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance to the next twig boundary, clamped to the overall end. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the coalesced ranged flush only if some PTE actually changed. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7419 
/*
 * Clear the given pp_attr bits across [start, end), retrying until the
 * underlying (preemptible) operation reports full completion.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The callee may make partial progress; loop until the range is done. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7450 #endif /* __ARM_RANGE_TLBI__ */
7451 
/*
 * Clear the given pp_attr bits on a single physical page, dispatching to
 * the PPL when the monitor is enabled.
 */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7473 
7474 /*
7475  *	Set specified attribute bits.
7476  *
7477  *	Set cached value in the pv head because we have
7478  *	no per-mapping hardware support for referenced and
7479  *	modify bits.
7480  */
7481 MARK_AS_PMAP_TEXT void
7482 phys_attribute_set_internal(
7483 	ppnum_t pn,
7484 	unsigned int bits)
7485 {
7486 	pmap_paddr_t    pa = ptoa(pn);
7487 	assert(pn != vm_page_fictitious_addr);
7488 
7489 #if XNU_MONITOR
7490 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7491 		panic("%s: illegal request, "
7492 		    "pn=%u, bits=%#x",
7493 		    __FUNCTION__,
7494 		    pn, bits);
7495 	}
7496 #endif
7497 
7498 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7499 
7500 	return;
7501 }
7502 
/* Set attribute bits on a page, routing through the PPL when present. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7514 
7515 
7516 /*
7517  *	Check specified attribute bits.
7518  *
7519  *	use the software cached bits (since no hw support).
7520  */
7521 static boolean_t
7522 phys_attribute_test(
7523 	ppnum_t pn,
7524 	unsigned int bits)
7525 {
7526 	pmap_paddr_t    pa = ptoa(pn);
7527 	assert(pn != vm_page_fictitious_addr);
7528 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7529 }
7530 
7531 
7532 /*
7533  *	Set the modify/reference bits on the specified physical page.
7534  */
void
pmap_set_modify(ppnum_t pn)
{
	/* Mark the page dirty in the software attribute table. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7540 
7541 
7542 /*
7543  *	Clear the modify bits on the specified physical page.
7544  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clear the dirty bit; mappings are downgraded so the next write faults. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7551 
7552 
7553 /*
7554  *	pmap_is_modified:
7555  *
7556  *	Return whether or not the specified physical page is modified
7557  *	by any physical maps.
7558  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Software-cached state; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7565 
7566 
7567 /*
7568  *	Set the reference bit on the specified physical page.
7569  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Mark the page referenced in the software attribute table. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7576 
7577 /*
7578  *	Clear the reference bits on the specified physical page.
7579  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clear the referenced bit; mappings are made to fault on next access. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7586 
7587 
7588 /*
7589  *	pmap_is_referenced:
7590  *
7591  *	Return whether or not the specified physical page is referenced
7592  *	by any physical maps.
7593  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Software-cached state; no PTE scan is needed. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7600 
7601 /*
7602  * pmap_get_refmod(phys)
7603  *  returns the referenced and modified bits of the specified
7604  *  physical page.
7605  */
7606 unsigned int
7607 pmap_get_refmod(
7608 	ppnum_t pn)
7609 {
7610 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7611 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7612 }
7613 
7614 static inline unsigned int
7615 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7616 {
7617 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7618 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7619 }
7620 
7621 /*
7622  * pmap_clear_refmod(phys, mask)
7623  *  clears the referenced and modified bits as specified by the mask
7624  *  of the specified physical page.
7625  */
7626 void
7627 pmap_clear_refmod_options(
7628 	ppnum_t         pn,
7629 	unsigned int    mask,
7630 	unsigned int    options,
7631 	void            *arg)
7632 {
7633 	unsigned int    bits;
7634 
7635 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7636 	phys_attribute_clear(pn, bits, options, arg);
7637 }
7638 
7639 /*
7640  * Perform pmap_clear_refmod_options on a virtual address range.
7641  * The operation will be performed in bulk & tlb flushes will be coalesced
7642  * if possible.
7643  *
7644  * Returns true if the operation is supported on this platform.
7645  * If this function returns false, the operation is not supported and
7646  * nothing has been modified in the pmap.
7647  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses.  This is a large performance improvement
	 * on platforms that support ranged TLBI instructions.  But on older
	 * platforms we can only flush per-page or the entire ASID, so we
	 * currently support this operation only on platforms with ranged TLBI
	 * instructions.  On other platforms, we require that the VM modify the
	 * bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7675 
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper: no options, no flush argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7683 
7684 unsigned int
7685 pmap_disconnect_options(
7686 	ppnum_t pn,
7687 	unsigned int options,
7688 	void *arg)
7689 {
7690 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7691 		/*
7692 		 * On ARM, the "modified" bit is managed by software, so
7693 		 * we know up-front if the physical page is "modified",
7694 		 * without having to scan all the PTEs pointing to it.
7695 		 * The caller should have made the VM page "busy" so noone
7696 		 * should be able to establish any new mapping and "modify"
7697 		 * the page behind us.
7698 		 */
7699 		if (pmap_is_modified(pn)) {
7700 			/*
7701 			 * The page has been modified and will be sent to
7702 			 * the VM compressor.
7703 			 */
7704 			options |= PMAP_OPTIONS_COMPRESSOR;
7705 		} else {
7706 			/*
7707 			 * The page hasn't been modified and will be freed
7708 			 * instead of compressed.
7709 			 */
7710 		}
7711 	}
7712 
7713 	/* disconnect the page */
7714 	pmap_page_protect_options(pn, 0, options, arg);
7715 
7716 	/* return ref/chg status */
7717 	return pmap_get_refmod(pn);
7718 }
7719 
7720 /*
7721  *	Routine:
7722  *		pmap_disconnect
7723  *
7724  *	Function:
7725  *		Disconnect all mappings for this page and return reference and change status
7726  *		in generic format.
7727  *
7728  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	/* Remove all mappings of the page, then report its ref/mod state. */
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7736 
7737 boolean_t
7738 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7739 {
7740 	if (ptoa(first) >= vm_last_phys) {
7741 		return FALSE;
7742 	}
7743 	if (ptoa(last) < vm_first_phys) {
7744 		return FALSE;
7745 	}
7746 
7747 	return TRUE;
7748 }
7749 
7750 /*
7751  * The state maintained by the noencrypt functions is used as a
7752  * debugging aid on ARM.  This incurs some overhead on the part
7753  * of the caller.  A special case check in phys_attribute_clear
7754  * (the most expensive path) currently minimizes this overhead,
7755  * but stubbing these functions out on RELEASE kernels yields
7756  * further wins.
7757  */
boolean_t
pmap_is_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	boolean_t result = FALSE;

	/* Unmanaged pages carry no attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return FALSE;
	}

	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));

	return result;
#else
	/* On RELEASE kernels the noencrypt state is not tracked (debug aid only). */
#pragma unused(pn)
	return FALSE;
#endif
}
7777 
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
	/* On RELEASE kernels the noencrypt state is not tracked (debug aid only). */
#pragma unused(pn)
#endif
}
7792 
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no attribute state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
	/* On RELEASE kernels the noencrypt state is not tracked (debug aid only). */
#pragma unused(pn)
#endif
}
7807 
7808 #if XNU_MONITOR
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	/* Reports whether the (managed) page is owned by the PPL monitor. */
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7815 #endif
7816 
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	/*
	 * Managed pages are locked via their PV head lock.  Unmanaged pages —
	 * and all pages in XNU_MONITOR builds, where this path is compiled
	 * out — fall through to the shared phys_backup_lock.  Note that the
	 * braced statement after the preprocessor block completes the
	 * dangling `else` above.
	 */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7833 
7834 
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	/*
	 * Mirror of pmap_lock_phys_page(): release the PV head lock for
	 * managed pages, or the shared phys_backup_lock otherwise.  The
	 * braced statement after the preprocessor block completes the
	 * dangling `else` above.
	 */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7851 
/*
 * Program the user translation table base for the given pmap on the current
 * CPU, caching the pmap's nested-region state in the per-CPU data.  For the
 * kernel pmap, the user TTB is instead reset to the invalid translation
 * table (if not already clear).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Cache nested (shared region) pmap state for fault handling on this CPU. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only if the target pmap's page-size configuration differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Install the pmap's table base together with its hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7880 
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	/* Point the user TTB at the invalid translation table. */
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7886 
/* Clear the user TTB, dispatching to the PPL when the monitor is enabled. */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7898 
7899 
7900 #if defined(__arm64__)
7901 /*
7902  * Marker for use in multi-pass fast-fault PV list processing.
7903  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7904  * these functions, as compressed PTEs should never be present in PV lists.
7905  * Note that this only holds true for arm64; for arm32 we don't have enough
7906  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7907  * and WRITEABLE marker depending on whether the PTE is valid.
7908  */
7909 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7910 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7911 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7912 #endif
7913 
7914 
/*
 * Walk every mapping of ppnum and modify its PTE so that accesses outside
 * allow_mode will take an arm_fast_fault(): excluding VM_PROT_READ clears
 * the AF bit, and excluding VM_PROT_WRITE downgrades RW mappings to RO
 * (marking them "was writeable").  TLB invalidation is batched into a
 * second pass over the PV list, or deferred entirely to the caller via
 * flush_range.  Also maintains the per-task "reusable"/"internal" ledger
 * accounting and the per-page mod/ref fault attributes.
 *
 * Returns TRUE if all mappings were processed, FALSE if the page is
 * unmanaged or a wired mapping was skipped (when PMAP_OPTIONS_FF_WIRED is
 * not set).  The PVH lock is taken internally unless the caller passed
 * PMAP_OPTIONS_FF_LOCKED.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t     phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	unsigned int     pai;
	unsigned int     pass1_updated = 0;
	unsigned int     pass2_updated = 0;
	boolean_t        result;
	pv_entry_t     **pv_h;
	bool             is_reusable;
	bool             ref_fault;
	bool             mod_fault;
	bool             clear_write_fault = false;
	bool             ref_aliases_mod = false;
	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single inline PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU-visible PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings cannot be made to fault unless the caller opted in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/* A permission change (beyond the SW "writeable" bit) requires a TLB flush. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					/* Mark this PTE so pass 2 knows to invalidate its TLB entry. */
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		/* Make all pass-1 PTE stores visible before any TLB invalidation. */
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need a TLB invalidation here. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		/* Mappings inside the caller's flush_range are flushed by the caller. */
		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8228 
8229 MARK_AS_PMAP_TEXT boolean_t
8230 arm_force_fast_fault_internal(
8231 	ppnum_t         ppnum,
8232 	vm_prot_t       allow_mode,
8233 	int             options)
8234 {
8235 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8236 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8237 	}
8238 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8239 }
8240 
8241 /*
8242  *	Routine:	arm_force_fast_fault
8243  *
8244  *	Function:
8245  *		Force all mappings for this page to fault according
8246  *		to the access modes allowed, so we can gather ref/modify
8247  *		bits again.
8248  */
8249 
8250 boolean_t
8251 arm_force_fast_fault(
8252 	ppnum_t         ppnum,
8253 	vm_prot_t       allow_mode,
8254 	int             options,
8255 	__unused void   *arg)
8256 {
8257 	pmap_paddr_t    phys = ptoa(ppnum);
8258 
8259 	assert(ppnum != vm_page_fictitious_addr);
8260 
8261 	if (!pa_valid(phys)) {
8262 		return FALSE;   /* Not a managed page. */
8263 	}
8264 
8265 #if XNU_MONITOR
8266 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8267 #else
8268 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8269 #endif
8270 }
8271 
8272 /*
8273  *	Routine:	arm_clear_fast_fault
8274  *
8275  *	Function:
8276  *		Clear pending force fault for all mappings for this page based on
8277  *		the observed fault type, update ref/modify bits.
8278  */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;           /* TRUE iff at least one PTE was updated */
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;         /* PVEs visited; bounds work done per call */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/*
	 * If the caller supplied a specific PTE, operate on that mapping only;
	 * otherwise walk every mapping recorded in the page's PV head.
	 */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting position so pass 2 can re-walk the same list. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; nothing to fix up here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/*
			 * Write fault on a mapping that was demoted to read-only for
			 * mod-bit tracking: restore write permission, set AF, and
			 * record both referenced and modified.
			 */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault with AF clear: set AF and record referenced only. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/*
			 * Only changes to HW-visible bits (anything other than the
			 * SW "was writeable" bit) require a TLB invalidation; tag
			 * such PTEs with ARM_PTE_FF_MARKER so pass 2 can find them.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/*
			 * Stop early on an excessively large PV list; the caller
			 * (see arm_fast_fault_internal) may redrive with a targeted
			 * single-PTE fixup.
			 */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make the pass-1 PTE stores visible before issuing TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation; clear the tag now. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must mirror the pass-1 early-out so both passes cover the same PTEs. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have seen exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8480 
8481 /*
8482  * Determine if the fault was induced by software tracking of
8483  * modify/reference bits.  If so, re-enable the mapping (and set
8484  * the appropriate bits).
8485  *
8486  * Returns KERN_SUCCESS if the fault was induced and was
8487  * successfully handled.
8488  *
8489  * Returns KERN_FAILURE if the fault was not induced and
8490  * the function was unable to deal with it.
8491  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
 * disallows this type of access.
8494  *
8495  * Returns KERN_ABORTED if the pmap lock is taken and a
8496  * preemption is pending.
8497  *
8498  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Back off (KERN_ABORTED) rather than blocking a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we have read a stable PTE value with the page's PVH
		 * lock held: lock the PVH for the PTE's target page, then re-read
		 * the PTE to confirm it didn't change while we were acquiring it.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Faults on PPL-owned I/O memory are explicit protection failures. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed underneath us; drop the lock and retry. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}

	/* From here on, the PVH lock for pai is held (dropped at the end). */

	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8638 
8639 kern_return_t
8640 arm_fast_fault(
8641 	pmap_t pmap,
8642 	vm_map_address_t va,
8643 	vm_prot_t fault_type,
8644 	bool was_af_fault,
8645 	__unused bool from_user)
8646 {
8647 	kern_return_t   result = KERN_FAILURE;
8648 
8649 	if (va < pmap->min || va >= pmap->max) {
8650 		return result;
8651 	}
8652 
8653 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8654 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8655 	    from_user);
8656 
8657 	do {
8658 #if XNU_MONITOR
8659 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8660 #else
8661 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8662 #endif
8663 	} while (result == KERN_ABORTED);
8664 
8665 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8666 
8667 	return result;
8668 }
8669 
8670 void
8671 pmap_copy_page(
8672 	ppnum_t psrc,
8673 	ppnum_t pdst)
8674 {
8675 	bcopy_phys((addr64_t) (ptoa(psrc)),
8676 	    (addr64_t) (ptoa(pdst)),
8677 	    PAGE_SIZE);
8678 }
8679 
8680 
8681 /*
8682  *	pmap_copy_page copies the specified (machine independent) pages.
8683  */
8684 void
8685 pmap_copy_part_page(
8686 	ppnum_t psrc,
8687 	vm_offset_t src_offset,
8688 	ppnum_t pdst,
8689 	vm_offset_t dst_offset,
8690 	vm_size_t len)
8691 {
8692 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8693 	    (addr64_t) (ptoa(pdst) + dst_offset),
8694 	    len);
8695 }
8696 
8697 
8698 /*
8699  *	pmap_zero_page zeros the specified (machine independent) page.
8700  */
8701 void
8702 pmap_zero_page(
8703 	ppnum_t pn)
8704 {
8705 	assert(pn != vm_page_fictitious_addr);
8706 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8707 }
8708 
8709 /*
8710  *	pmap_zero_part_page
8711  *	zeros the specified (machine independent) part of a page.
8712  */
8713 void
8714 pmap_zero_part_page(
8715 	ppnum_t pn,
8716 	vm_offset_t offset,
8717 	vm_size_t len)
8718 {
8719 	assert(pn != vm_page_fictitious_addr);
8720 	assert(offset + len <= PAGE_SIZE);
8721 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8722 }
8723 
/*
 * Establish the read-only kernel mapping of the lowGlo page at its fixed
 * LOWGLOBAL_ALIAS address.  The target PTE must currently be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* Kernel read-only, never-executable mapping of the lowGlo structure. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	/* Normal write-back cacheable, outer-shareable memory. */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE store before invalidating any stale TLB entry. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8748 
8749 vm_offset_t
8750 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8751 {
8752 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8753 		panic("%s: invalid index %u", __func__, index);
8754 	}
8755 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8756 }
8757 
/*
 * Map physical page pn into a free per-CPU copy window with the requested
 * protection and caching attributes.  Returns the window index, which the
 * caller later passes to pmap_unmap_cpu_windows_copy().
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	/* Cache attributes are only looked up for non-managed (I/O) pages. */
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* The PPL forbids copy-window mappings of managed pages and writable
	 * mappings of PPL-owned I/O memory (unless the PPL is disabled). */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (faulting) window belonging to this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, never-executable mapping of the page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the strong-sync setting left by the window's previous occupant. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8832 
8833 unsigned int
8834 pmap_map_cpu_windows_copy(
8835 	ppnum_t pn,
8836 	vm_prot_t prot,
8837 	unsigned int wimg_bits)
8838 {
8839 #if XNU_MONITOR
8840 	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8841 #else
8842 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8843 #endif
8844 }
8845 
8846 MARK_AS_PMAP_TEXT void
8847 pmap_unmap_cpu_windows_copy_internal(
8848 	unsigned int index)
8849 {
8850 	pt_entry_t      *ptep;
8851 	unsigned int    cpu_num;
8852 	vm_offset_t     cpu_copywindow_vaddr = 0;
8853 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8854 
8855 	cpu_num = pmap_cpu_data->cpu_number;
8856 
8857 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8858 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
8859 	 * (which are likely to have been on I/O memory) are complete before
8860 	 * tearing down the mapping. */
8861 	__builtin_arm_dsb(DSB_SY);
8862 	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8863 	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8864 	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8865 }
8866 
/*
 * Unmap a per-CPU copy window, routing through the PPL when the monitor is
 * enabled.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8877 
8878 #if XNU_MONITOR
8879 
8880 MARK_AS_PMAP_TEXT void
8881 pmap_invoke_with_page(
8882 	ppnum_t page_number,
8883 	void *ctx,
8884 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8885 {
8886 	#pragma unused(page_number, ctx, callback)
8887 }
8888 
8889 /*
8890  * Loop over every pmap_io_range (I/O ranges marked as owned by
8891  * the PPL in the device tree) and conditionally call callback() on each range
8892  * that needs to be included in the hibernation image.
8893  *
8894  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8895  *                 context is needed in the callback.
8896  * @param callback Callback function invoked on each range (gated by flag).
8897  */
8898 MARK_AS_PMAP_TEXT void
8899 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8900 {
8901 	extern const pmap_io_range_t* io_attr_table;
8902 	extern const unsigned int num_io_rgns;
8903 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8904 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8905 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8906 		}
8907 	}
8908 }
8909 
8910 /**
8911  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8912  * PPL-owned page. Otherwise, do nothing.
8913  *
8914  * @param addr Physical address of the page to set the HASHED flag on.
8915  */
8916 MARK_AS_PMAP_TEXT void
8917 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8918 {
8919 	/* Ignore non-managed kernel memory. */
8920 	if (!pa_valid(addr)) {
8921 		return;
8922 	}
8923 
8924 	const unsigned int pai = pa_index(addr);
8925 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8926 		pv_entry_t **pv_h = pai_to_pvh(pai);
8927 
8928 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
8929 		pvh_lock(pai);
8930 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8931 		pvh_unlock(pai);
8932 	}
8933 }
8934 
8935 /**
8936  * Loop through every physical page in the system and clear out the HASHED flag
8937  * on every PPL-owned page. That flag is used to keep track of which pages have
8938  * been hashed into the hibernation image during the hibernation entry process.
8939  *
8940  * The HASHED flag needs to be cleared out between hibernation cycles because the
8941  * pv_head_table and pp_attr_table's might have been copied into the hibernation
8942  * image with the HASHED flag set on certain pages. It's important to clear the
8943  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8944  * into the hibernation image can't be compromised across hibernation cycles.
8945  */
8946 MARK_AS_PMAP_TEXT void
8947 pmap_clear_ppl_hashed_flag_all(void)
8948 {
8949 	const unsigned int last_index = pa_index(vm_last_phys);
8950 	pv_entry_t **pv_h = NULL;
8951 
8952 	for (int pai = 0; pai < last_index; ++pai) {
8953 		pv_h = pai_to_pvh(pai);
8954 
8955 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8956 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8957 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8958 			pvh_lock(pai);
8959 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8960 			pvh_unlock(pai);
8961 		}
8962 	}
8963 }
8964 
8965 /**
8966  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8967  * ppl_hib driver will call this after all wired pages have been copied into the
8968  * hibernation image.
8969  */
8970 MARK_AS_PMAP_TEXT void
8971 pmap_check_ppl_hashed_flag_all(void)
8972 {
8973 	const unsigned int last_index = pa_index(vm_last_phys);
8974 	pv_entry_t **pv_h = NULL;
8975 
8976 	for (int pai = 0; pai < last_index; ++pai) {
8977 		pv_h = pai_to_pvh(pai);
8978 
8979 		/**
8980 		 * The PMAP stacks are explicitly not saved into the image so skip checking
8981 		 * the pages that contain the PMAP stacks.
8982 		 */
8983 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8984 		    (pai < pa_index(pmap_stacks_end_pa));
8985 
8986 		if (!is_pmap_stack &&
8987 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8988 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8989 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8990 		}
8991 	}
8992 }
8993 
8994 #endif /* XNU_MONITOR */
8995 
8996 /*
8997  * Indicate that a pmap is intended to be used as a nested pmap
8998  * within one or more larger address spaces.  This must be set
8999  * before pmap_nest() is called with this pmap as the 'subordinate'.
9000  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Atomically retype USER -> NESTED; any other starting type is fatal. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* NOTE(review): presumably releases this pmap's ASID/translation ID, since a
	 * nested pmap is only ever entered through its parents — confirm semantics. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9047 
/* Public wrapper: route to the PPL when the monitor is enabled. */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
9058 
9059 bool
9060 pmap_is_nested(
9061 	pmap_t pmap)
9062 {
9063 	return pmap->type == PMAP_TYPE_NESTED;
9064 }
9065 
9066 /*
9067  * pmap_trim_range(pmap, start, end)
9068  *
9069  * pmap  = pmap to operate on
9070  * start = start of the range
9071  * end   = end of the range
9072  *
9073  * Attempts to deallocate TTEs for the given range in the nested range.
9074  */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	/* The requested range must lie entirely within the pmap's nested region. */
	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to trim at this twig; release the lock ourselves. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table; it can only go away if fully empty. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* Drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9190 
9191 /**
9192  * State machine for multi-step pmap trimming. Trimming is the action of
9193  * deallocating the TTEs of the shared region of pmaps down to a given range.
9194  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9195  * disabling preemption for too long. These steps include computing the bounds
9196  * of the shared region, trimming the head of the "grand", trimming the tail of
9197  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9198  * different conditions.
9199  *
9200  * @param grand the pmap in which the pages are nested
9201  * @param subord the pmap from which the pages are shared, or nested
9202  * @param vstart start of the used range in "grand"
9203  * @param size size of the used range
9204  * @param state the current state of the state machine
9205  *
9206  * @return the next state of the state machine, to be used in the next call
9207  *         into this function.
9208  */
9209 MARK_AS_PMAP_TEXT pmap_trim_state_t
9210 pmap_trim_internal(
9211 	pmap_t grand,
9212 	pmap_t subord,
9213 	addr64_t vstart,
9214 	uint64_t size,
9215 	pmap_trim_state_t state)
9216 {
9217 	/* Validation needs to be done regardless of state. */
9218 	addr64_t vend;
9219 
9220 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9221 		panic("%s: grand addr wraps around, "
9222 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9223 		    __func__, grand, subord, (void*)vstart, size, state);
9224 	}
9225 
9226 	validate_pmap_mutable(grand);
9227 	validate_pmap(subord);
9228 
9229 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9230 		panic("%s: subord is of non-nestable type 0x%hhx, "
9231 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9232 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9233 	}
9234 
9235 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9236 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9237 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9238 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9239 	}
9240 
9241 	if (__improbable(grand->nested_pmap != subord)) {
9242 		panic("%s: grand->nested != subord, "
9243 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9244 		    __func__, grand, subord, (void*)vstart, size, state);
9245 	}
9246 
9247 	if (__improbable((size != 0) &&
9248 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9249 		panic("%s: grand range not in nested region, "
9250 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9251 		    __func__, grand, subord, (void*)vstart, size, state);
9252 	}
9253 
9254 
9255 	/* Trimming starts with figuring out the bounds for the grand. */
9256 	if (state == PMAP_TRIM_STATE_START) {
9257 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9258 
9259 		/**
9260 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9261 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9262 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9263 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9264 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9265 		 * PMAP_TRIM_STATE_DONE.
9266 		 */
9267 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9268 			assert(subord->nested_bounds_set);
9269 
9270 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9271 			if (!grand->nested_bounds_set) {
9272 				/* Inherit the bounds from subord. */
9273 				grand->nested_region_true_start = subord->nested_region_true_start;
9274 				grand->nested_region_true_end = subord->nested_region_true_end;
9275 				grand->nested_bounds_set = true;
9276 			}
9277 
9278 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9279 
9280 			/* Now that the grand has bounds, we are done. */
9281 			return PMAP_TRIM_STATE_DONE;
9282 		}
9283 
9284 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9285 		if ((!subord->nested_bounds_set) && size) {
9286 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9287 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9288 
9289 			subord->nested_region_true_start = vstart;
9290 			subord->nested_region_true_end = vend;
9291 			subord->nested_region_true_start &= ~adjust_offmask;
9292 
9293 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9294 				panic("%s: padded true end wraps around, "
9295 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9296 				    __func__, grand, subord, (void*)vstart, size, state);
9297 			}
9298 
9299 			subord->nested_region_true_end &= ~adjust_offmask;
9300 			subord->nested_bounds_set = true;
9301 		}
9302 
9303 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9304 		if (subord->nested_bounds_set) {
9305 			/* Inherit the bounds from subord. */
9306 			grand->nested_region_true_start = subord->nested_region_true_start;
9307 			grand->nested_region_true_end = subord->nested_region_true_end;
9308 			grand->nested_bounds_set = true;
9309 
9310 			/* If we know the bounds, we can trim the pmap. */
9311 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9312 
9313 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9314 		} else {
9315 			/* Don't trim if we don't know the bounds. */
9316 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9317 
9318 			return PMAP_TRIM_STATE_DONE;
9319 		}
9320 	}
9321 
9322 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9323 	if (!grand->nested_bounds_set) {
9324 		panic("%s: !grand->nested_bounds_set, "
9325 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9326 		    __func__, grand, subord, (void*)vstart, size, state);
9327 	}
9328 
9329 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9330 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9331 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9332 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9333 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9334 			    (unsigned int)grand->nested_no_bounds_ref_state);
9335 		}
9336 
9337 #if XNU_MONITOR
9338 		if (pmap_pending_preemption()) {
9339 			return PMAP_TRIM_STATE_GRAND_AFTER;
9340 		}
9341 #endif
9342 
9343 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9344 	}
9345 
9346 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9347 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9348 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9349 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9350 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9351 			    (unsigned int)grand->nested_no_bounds_ref_state);
9352 		}
9353 
9354 #if XNU_MONITOR
9355 		if (pmap_pending_preemption()) {
9356 			return PMAP_TRIM_STATE_SUBORD;
9357 		}
9358 #endif
9359 
9360 		state = PMAP_TRIM_STATE_SUBORD;
9361 	}
9362 
9363 	/* START state is guaranteed to compute the bounds for the subord. */
9364 	if (!subord->nested_bounds_set) {
9365 		panic("%s: !subord->nested_bounds_set, "
9366 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9367 		    __func__, grand, subord, (void*)vstart, size, state);
9368 	}
9369 
9370 	if (state == PMAP_TRIM_STATE_SUBORD) {
9371 		/**
9372 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9373 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9374 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9375 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9376 		 * the state update is visible only once the preceding trim operation is complete.  An
9377 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9378 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9379 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9380 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9381 		 * of the state CAS.
9382 		 */
9383 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9384 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9385 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9386 			    (unsigned int)grand->nested_no_bounds_ref_state);
9387 		}
9388 		pmap_trim_subord(subord);
9389 	}
9390 
9391 	return PMAP_TRIM_STATE_DONE;
9392 }
9393 
9394 MARK_AS_PMAP_TEXT static void
9395 pmap_trim_self(pmap_t pmap)
9396 {
9397 	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9398 		/* If we have a no bounds ref, we need to drop it. */
9399 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9400 		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9401 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9402 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9403 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9404 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9405 
9406 		if (nested_bounds_set) {
9407 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9408 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9409 		}
9410 		/*
9411 		 * Try trimming the nested pmap, in case we had the
9412 		 * last reference.
9413 		 */
9414 		pmap_trim_subord(pmap->nested_pmap);
9415 	}
9416 }
9417 
9418 /*
9419  * pmap_trim_subord(grand, subord)
9420  *
9421  * grand  = pmap that we have nested subord in
9422  * subord = nested pmap we are attempting to trim
9423  *
9424  * Trims subord if possible
9425  */
9426 MARK_AS_PMAP_TEXT static void
9427 pmap_trim_subord(pmap_t subord)
9428 {
9429 	bool contract_subord = false;
9430 
9431 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9432 
9433 	subord->nested_no_bounds_refcnt--;
9434 
9435 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9436 		/* If this was the last no bounds reference, trim subord. */
9437 		contract_subord = true;
9438 	}
9439 
9440 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9441 
9442 	if (contract_subord) {
9443 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9444 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9445 	}
9446 }
9447 
9448 /**
9449  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9450  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9451  * disabling preemption for too long.
9452  *
 * @note When we load the shared region we always create page tables for the
9454  *       entire region. In practice, the shared cache may use just a portion
9455  *       of that. Before we know the bounds of the shared region, it can
9456  *       already be mapped into processes. Therefore, once the bounds are
9457  *       known, "trimming" comes in handy to remove the unnecessary page
9458  *       tables in the processes the shared region is mapped in, and eventually
9459  *       those in the shared region itself. Note that the shared region must
9460  *       be trimmed after the user processes because it has the L3 entries
9461  *       everyone else is pointing to.
9462  *
9463  * @param grand the pmap in which the pages are nested
9464  * @param subord the pmap from which the pages are shared, or nested
9465  * @param vstart start of the used range in "grand"
9466  * @param size size of the used range
9467  */
9468 void
9469 pmap_trim(
9470 	pmap_t grand,
9471 	pmap_t subord,
9472 	addr64_t vstart,
9473 	uint64_t size)
9474 {
9475 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9476 
9477 #if XNU_MONITOR
9478 	/* On PPL systems, drives the state machine until its done. */
9479 	while (state != PMAP_TRIM_STATE_DONE) {
9480 		__assert_only pmap_trim_state_t old_state = state;
9481 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9482 
9483 		/* Are we making progress? */
9484 		assert(old_state != state);
9485 	}
9486 
9487 	pmap_ledger_check_balance(grand);
9488 	pmap_ledger_check_balance(subord);
9489 #else
9490 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9491 
9492 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9493 	assert(state == PMAP_TRIM_STATE_DONE);
9494 #endif
9495 }
9496 
9497 #if HAS_APPLE_PAC
9498 void *
9499 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9500 {
9501 	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9502 		panic("attempt to sign user pointer without process independent key");
9503 	}
9504 
9505 	void *res = NULL;
9506 	uint64_t current_intr_state = pmap_interrupts_disable();
9507 
9508 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9509 
9510 	__compiler_materialize_and_prevent_reordering_on(value);
9511 	switch (key) {
9512 	case ptrauth_key_asia:
9513 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9514 		break;
9515 	case ptrauth_key_asda:
9516 		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9517 		break;
9518 	default:
9519 		__builtin_unreachable();
9520 	}
9521 	__compiler_materialize_and_prevent_reordering_on(res);
9522 
9523 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9524 
9525 	pmap_interrupts_restore(current_intr_state);
9526 
9527 	return res;
9528 }
9529 
/**
 * Sign a user-space pointer with a process-independent PAC key.
 * Thin wrapper that forwards directly to pmap_sign_user_ptr_internal().
 */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9535 
9536 void *
9537 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9538 {
9539 	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9540 		panic("attempt to auth user pointer without process independent key");
9541 	}
9542 
9543 	void *res = NULL;
9544 	uint64_t current_intr_state = pmap_interrupts_disable();
9545 
9546 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9547 	__compiler_materialize_and_prevent_reordering_on(value);
9548 	res = ml_auth_ptr_unchecked(value, key, discriminator);
9549 	__compiler_materialize_and_prevent_reordering_on(res);
9550 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9551 
9552 	pmap_interrupts_restore(current_intr_state);
9553 
9554 	return res;
9555 }
9556 
/**
 * Authenticate a user-space pointer signed with a process-independent PAC key.
 * Thin wrapper that forwards directly to pmap_auth_user_ptr_internal().
 */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9562 #endif /* HAS_APPLE_PAC */
9563 
9564 /*
9565  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9566  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9567  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9568  * return value, to indicate where a preempted [un]nest operation should resume.
9569  * When the return value contains the ending address of the nested region with
9570  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9571  */
9572 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9573 
9574 /*
9575  *	kern_return_t pmap_nest(grand, subord, vstart, size)
9576  *
9577  *	grand  = the pmap that we will nest subord into
9578  *	subord = the pmap that goes into the grand
9579  *	vstart  = start of range in pmap to be inserted
9580  *	size   = Size of nest area (up to 16TB)
9581  *
9582  *	Inserts a pmap into another.  This is used to implement shared segments.
9583  *
9584  */
9585 
9586 /**
9587  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9588  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9589  * This function operates in 3 main phases:
9590  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9591  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9592  *    the mapping range are present in subord.
9593  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9594  *    contains pointers to subord's leaf-level pagetable pages for the specified
9595  *    VA range.
9596  *
9597  * This function may return early due to pending AST_URGENT preemption; if so
9598  * it will indicate the need to be re-entered.
9599  *
9600  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9601  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9602  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9603  * @param size twig-aligned size of the nesting range
9604  * @param vrestart the twig-aligned starting address of the current call.  May contain
9605  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9606  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9607  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9608  *
9609  * @return the virtual address at which to restart the operation, possibly including
9610  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9611  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9612  */
9613 MARK_AS_PMAP_TEXT vm_map_offset_t
9614 pmap_nest_internal(
9615 	pmap_t grand,
9616 	pmap_t subord,
9617 	addr64_t vstart,
9618 	uint64_t size,
9619 	vm_map_offset_t vrestart,
9620 	kern_return_t *krp)
9621 {
9622 	kern_return_t kr = KERN_FAILURE;
9623 	vm_map_offset_t vaddr;
9624 	tt_entry_t     *stte_p;
9625 	tt_entry_t     *gtte_p;
9626 	uint64_t        nested_region_unnested_table_bitmap_size;
9627 	unsigned int*   nested_region_unnested_table_bitmap = NULL;
9628 	uint64_t        new_nested_region_unnested_table_bitmap_size;
9629 	unsigned int*   new_nested_region_unnested_table_bitmap = NULL;
9630 	int             expand_options = 0;
9631 	bool            deref_subord = true;
9632 	bool            grand_locked = false;
9633 
9634 	addr64_t vend;
9635 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9636 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9637 	}
9638 	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9639 	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9640 		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9641 		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9642 	}
9643 
9644 	assert(krp != NULL);
9645 	validate_pmap_mutable(grand);
9646 	validate_pmap(subord);
9647 #if XNU_MONITOR
9648 	/*
9649 	 * Ordering is important here.  validate_pmap() has already ensured subord is a
9650 	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9651 	 * be in the process of being destroyed.  If destruction is already committed,
9652 	 * then the check of ref_count below will cover us.  If destruction is initiated
9653 	 * during or after this call, then pmap_destroy() will catch the non-zero
9654 	 * nested_count.
9655 	 */
9656 	os_atomic_inc(&subord->nested_count, relaxed);
9657 	os_atomic_thread_fence(seq_cst);
9658 #endif
9659 	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
9660 		panic("%s: invalid subordinate pmap %p", __func__, subord);
9661 	}
9662 
9663 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9664 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9665 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9666 	}
9667 
9668 #if XNU_MONITOR
9669 	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9670 #endif
9671 
9672 	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9673 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9674 		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9675 		    grand, vstart, size, (unsigned long long)vrestart);
9676 	}
9677 
9678 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9679 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9680 	}
9681 
9682 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9683 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9684 	}
9685 
9686 	if (subord->nested_region_unnested_table_bitmap == NULL) {
9687 		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9688 
9689 		/**
9690 		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
9691 		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
9692 		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
9693 		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
9694 		 */
9695 		nested_region_unnested_table_bitmap_size <<= 1;
9696 
9697 		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9698 			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9699 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9700 			    __func__, nested_region_unnested_table_bitmap_size,
9701 			    grand, subord, vstart, size);
9702 		}
9703 
9704 #if XNU_MONITOR
9705 		pmap_paddr_t pa = 0;
9706 
9707 		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9708 			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9709 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9710 			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
9711 			    grand, subord, vstart, size);
9712 		}
9713 
9714 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9715 
9716 		if (kr != KERN_SUCCESS) {
9717 			goto nest_cleanup;
9718 		}
9719 
9720 		assert(pa);
9721 
9722 		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9723 #else
9724 		nested_region_unnested_table_bitmap = kalloc_data(
9725 			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9726 			Z_WAITOK | Z_ZERO);
9727 #endif
9728 
9729 		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9730 			kr = KERN_ABORTED;
9731 			goto nest_cleanup;
9732 		}
9733 
9734 		if (subord->nested_region_unnested_table_bitmap == NULL) {
9735 			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9736 			subord->nested_region_addr = vstart;
9737 			subord->nested_region_size = (mach_vm_offset_t) size;
9738 
9739 			/**
9740 			 * Ensure that the rest of the subord->nested_region_* fields are
9741 			 * initialized and visible before setting the nested_region_unnested_table_bitmap
9742 			 * field (which is used as the flag to say that the rest are initialized).
9743 			 */
9744 			__builtin_arm_dmb(DMB_ISHST);
9745 			subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
9746 			nested_region_unnested_table_bitmap = NULL;
9747 		}
9748 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9749 		if (nested_region_unnested_table_bitmap != NULL) {
9750 #if XNU_MONITOR
9751 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9752 #else
9753 			kfree_data(nested_region_unnested_table_bitmap,
9754 			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9755 #endif
9756 			nested_region_unnested_table_bitmap = NULL;
9757 		}
9758 	}
9759 
9760 	/**
9761 	 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9762 	 * speculated before their initialization.
9763 	 */
9764 	__builtin_arm_dmb(DMB_ISHLD);
9765 
9766 	if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9767 		uint64_t        new_size;
9768 
9769 		nested_region_unnested_table_bitmap = NULL;
9770 		nested_region_unnested_table_bitmap_size = 0ULL;
9771 		new_size =  vend - subord->nested_region_addr;
9772 
9773 		new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9774 		new_nested_region_unnested_table_bitmap_size <<= 1;
9775 
9776 		if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9777 			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9778 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9779 			    __func__, new_nested_region_unnested_table_bitmap_size,
9780 			    grand, subord, vstart, size);
9781 		}
9782 
9783 #if XNU_MONITOR
9784 		pmap_paddr_t pa = 0;
9785 
9786 		if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9787 			panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9788 			    "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9789 			    __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
9790 			    grand, subord, vstart, new_size);
9791 		}
9792 
9793 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9794 
9795 		if (kr != KERN_SUCCESS) {
9796 			goto nest_cleanup;
9797 		}
9798 
9799 		assert(pa);
9800 
9801 		new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9802 #else
9803 		new_nested_region_unnested_table_bitmap = kalloc_data(
9804 			new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9805 			Z_WAITOK | Z_ZERO);
9806 #endif
9807 		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9808 			kr = KERN_ABORTED;
9809 			goto nest_cleanup;
9810 		}
9811 
9812 		if (subord->nested_region_size < new_size) {
9813 			bcopy(subord->nested_region_unnested_table_bitmap,
9814 			    new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9815 			nested_region_unnested_table_bitmap_size  = subord->nested_region_unnested_table_bitmap_size;
9816 			nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
9817 			subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
9818 			subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
9819 			subord->nested_region_size = new_size;
9820 			new_nested_region_unnested_table_bitmap = NULL;
9821 		}
9822 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9823 		if (nested_region_unnested_table_bitmap != NULL) {
9824 #if XNU_MONITOR
9825 			pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9826 #else
9827 			kfree_data(nested_region_unnested_table_bitmap,
9828 			    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9829 #endif
9830 			nested_region_unnested_table_bitmap = NULL;
9831 		}
9832 		if (new_nested_region_unnested_table_bitmap != NULL) {
9833 #if XNU_MONITOR
9834 			pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9835 #else
9836 			kfree_data(new_nested_region_unnested_table_bitmap,
9837 			    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9838 #endif
9839 			new_nested_region_unnested_table_bitmap = NULL;
9840 		}
9841 	}
9842 
9843 	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9844 		kr = KERN_ABORTED;
9845 		goto nest_cleanup;
9846 	}
9847 
9848 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
9849 		/**
9850 		 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9851 		 * into a nested pmap, which would then produce multiple levels of nesting.
9852 		 */
9853 		if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9854 			panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9855 		}
9856 		/*
9857 		 * If this is grand's first nesting operation, keep the reference on subord.
9858 		 * It will be released by pmap_destroy_internal() when grand is destroyed.
9859 		 */
9860 		deref_subord = false;
9861 
9862 		if (!subord->nested_bounds_set) {
9863 			/*
9864 			 * We are nesting without the shared regions bounds
9865 			 * being known.  We'll have to trim the pmap later.
9866 			 */
9867 			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9868 			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9869 				panic("%s: grand %p already nested", __func__, grand);
9870 			}
9871 			subord->nested_no_bounds_refcnt++;
9872 		}
9873 
9874 		if (__improbable(vstart < subord->nested_region_addr ||
9875 		    vend > (subord->nested_region_addr + subord->nested_region_size))) {
9876 			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
9877 			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
9878 			    (void *) (subord->nested_region_addr + subord->nested_region_size));
9879 		}
9880 
9881 		grand->nested_region_addr = vstart;
9882 		grand->nested_region_size = (mach_vm_offset_t) size;
9883 	} else {
9884 		if (__improbable(grand->nested_pmap != subord)) {
9885 			panic("pmap_nest() pmap %p has a nested pmap", grand);
9886 		} else if (__improbable(grand->nested_region_addr > vstart)) {
9887 			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9888 		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9889 			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9890 		}
9891 	}
9892 
9893 	vaddr = vrestart & ~PMAP_NEST_GRAND;
9894 	if (vaddr < subord->nested_region_true_start) {
9895 		vaddr = subord->nested_region_true_start;
9896 	}
9897 
9898 	addr64_t true_end = vend;
9899 	if (true_end > subord->nested_region_true_end) {
9900 		true_end = subord->nested_region_true_end;
9901 	}
9902 	__unused unsigned int ttecount = 0;
9903 
9904 	if (vrestart & PMAP_NEST_GRAND) {
9905 		goto nest_grand;
9906 	}
9907 
9908 	while (vaddr < true_end) {
9909 		stte_p = pmap_tte(subord, vaddr);
9910 		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9911 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9912 			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9913 
9914 			if (kr != KERN_SUCCESS) {
9915 				goto done;
9916 			}
9917 
9918 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9919 		}
9920 		vaddr += pt_attr_twig_size(pt_attr);
9921 		vrestart = vaddr;
9922 		++ttecount;
9923 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9924 		    pmap_pending_preemption())) {
9925 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9926 			kr = KERN_SUCCESS;
9927 			goto done;
9928 		}
9929 	}
9930 	/*
9931 	 * copy TTEs from subord pmap into grand pmap
9932 	 */
9933 
9934 	vaddr = (vm_map_offset_t) vstart;
9935 	if (vaddr < subord->nested_region_true_start) {
9936 		vaddr = subord->nested_region_true_start;
9937 	}
9938 	vrestart = vaddr | PMAP_NEST_GRAND;
9939 
9940 nest_grand:
9941 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9942 
9943 	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9944 		kr = KERN_ABORTED;
9945 		goto done;
9946 	}
9947 	while (vaddr < true_end) {
9948 		gtte_p = pmap_tte(grand, vaddr);
9949 		if (gtte_p == PT_ENTRY_NULL) {
9950 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9951 			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9952 			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9953 				if (kr == KERN_SUCCESS) {
9954 					kr = KERN_ABORTED;
9955 				}
9956 			}
9957 
9958 			if (kr != KERN_SUCCESS) {
9959 				goto done;
9960 			}
9961 
9962 			gtte_p = pmap_tt2e(grand, vaddr);
9963 		}
9964 		/* Don't leak a page table page.  Don't violate break-before-make. */
9965 		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9966 			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9967 			    __func__, gtte_p, grand);
9968 		}
9969 		/**
9970 		 * It's possible that grand was trimmed by pmap_trim_internal() while the
9971 		 * lock was dropped, in which case the previously stored "true" start/end
9972 		 * will no longer be accurate.  In that case, we need to avoid nesting
9973 		 * tables outside the trimmed range, as those tables may be immediately freed
9974 		 * which would lead to a dangling page table pointer in grand.
9975 		 * Note that pmap_trim() may concurrently update grand's bounds as we are
9976 		 * making these checks, but in that case pmap_trim_range() has not yet
9977 		 * been called on grand and will wait for us to drop grand's lock, so it
9978 		 * should see any TTEs we've nested here and clear them appropriately.
9979 		 */
9980 		if (__probable((vaddr >= grand->nested_region_true_start) &&
9981 		    (vaddr < grand->nested_region_true_end))) {
9982 			stte_p = pmap_tte(subord, vaddr);
9983 			if (__improbable(stte_p == PT_ENTRY_NULL)) {
9984 				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
9985 			}
9986 			*gtte_p = *stte_p;
9987 		}
9988 
9989 		vaddr += pt_attr_twig_size(pt_attr);
9990 		vrestart = vaddr | PMAP_NEST_GRAND;
9991 		++ttecount;
9992 		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9993 		    pmap_pending_preemption())) {
9994 			break;
9995 		}
9996 	}
9997 	if (vaddr >= true_end) {
9998 		vrestart = vend | PMAP_NEST_GRAND;
9999 	}
10000 
10001 	kr = KERN_SUCCESS;
10002 done:
10003 
10004 	FLUSH_PTE();
10005 	__builtin_arm_isb(ISB_SY);
10006 
10007 	if (grand_locked) {
10008 		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10009 	}
10010 
10011 nest_cleanup:
10012 #if XNU_MONITOR
10013 	if (kr != KERN_SUCCESS) {
10014 		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
10015 		*krp = kr;
10016 		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
10017 	}
10018 #else
10019 	if (kr != KERN_SUCCESS) {
10020 		*krp = kr;
10021 	}
10022 #endif
10023 	if (nested_region_unnested_table_bitmap != NULL) {
10024 #if XNU_MONITOR
10025 		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
10026 #else
10027 		kfree_data(nested_region_unnested_table_bitmap,
10028 		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
10029 #endif
10030 	}
10031 	if (new_nested_region_unnested_table_bitmap != NULL) {
10032 #if XNU_MONITOR
10033 		pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
10034 #else
10035 		kfree_data(new_nested_region_unnested_table_bitmap,
10036 		    new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
10037 #endif
10038 	}
10039 	if (deref_subord) {
10040 #if XNU_MONITOR
10041 		os_atomic_dec(&subord->nested_count, relaxed);
10042 #endif
10043 		pmap_destroy_internal(subord);
10044 	}
10045 	return vrestart;
10046 }
10047 
/**
 * Nest the mappings of a shared-region pmap ('subord') into a task's
 * top-level pmap ('grand') by repeatedly invoking the preemptible worker
 * until the entire [vstart, vstart + size) range has been processed.
 *
 * The worker returns a restart cursor; completion is signaled by the cursor
 * reaching (vend | PMAP_NEST_GRAND), i.e. the end address with the
 * "grand phase" marker bit set.
 *
 * @param grand pmap into which the shared mappings are inserted.
 * @param subord pmap providing the mappings to be shared.
 * @param vstart start address of the region to nest.
 * @param size size in bytes of the region to nest.
 *
 * @return KERN_SUCCESS, or the first unrecoverable error reported by the worker.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	/* Restart cursor; advanced by each call into the worker. */
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	/* Previous cursor value, used to detect lack of forward progress (PPL only). */
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor; anything else would loop forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10106 
10107 /*
 *	kern_return_t pmap_unnest(grand, vaddr, size)
10109  *
10110  *	grand  = the pmap that will have the virtual range unnested
10111  *	vaddr  = start of range in pmap to be unnested
10112  *	size   = size of range in pmap to be unnested
10113  *
10114  */
10115 
10116 kern_return_t
10117 pmap_unnest(
10118 	pmap_t grand,
10119 	addr64_t vaddr,
10120 	uint64_t size)
10121 {
10122 	return pmap_unnest_options(grand, vaddr, size, 0);
10123 }
10124 
10125 /**
10126  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10127  * from a top-level pmap ('grand').  The corresponding mappings in the nested
10128  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10129  * still have the region nested.  The mappings in 'grand' will be left empty
10130  * with the assumption that they will be demand-filled by subsequent access faults.
10131  *
10132  * This function operates in 2 main phases:
10133  * 1. Iteration over the nested pmap's mappings for the specified range to mark
10134  *    them non-global.
10135  * 2. Clearing of the twig-level TTEs for the address range in grand.
10136  *
10137  * This function may return early due to pending AST_URGENT preemption; if so
10138  * it will indicate the need to be re-entered.
10139  *
10140  * @param grand pmap from which to unnest mappings
10141  * @param vaddr twig-aligned virtual address for the beginning of the nested range
10142  * @param size twig-aligned size of the nested range
10143  * @param vrestart the page-aligned starting address of the current call.  May contain
10144  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10145  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10146  *        grand is being torn down and step 1) above is not needed.
10147  *
10148  * @return the virtual address at which to restart the operation, possibly including
10149  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
10150  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10151  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t     *tte_p;
	unsigned int    current_index;
	unsigned int    start_index;
	unsigned int    max_index;
	unsigned int    entry_count = 0;  /* Drives the periodic preemption checks below. */

	addr64_t vend;
	addr64_t true_end;
	/* Sanity-check the requested range and the restart cursor before touching state. */
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the operation to the nested pmap's "true" (possibly trimmed) end. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/*
	 * Phase 1: walk the nested pmap's leaf mappings in the range and mark them
	 * non-global.  Skipped entirely when grand is being torn down
	 * (PMAP_UNNEST_CLEAN) or when a previous call already finished this phase
	 * (PMAP_NEST_GRAND set in the restart cursor).
	 */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			/* Lock was busy; ask the caller to re-enter at the same cursor. */
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		/* Twig-table indices into the unnested-table bitmap for [start, true_end). */
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t  *bpte, *cpte;

			/* End of the current twig-sized region (exclusive). */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs and the adjacent "in-progress" bit will be set.
			 */
			if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t    pa;
					unsigned int    pai = 0;
					boolean_t               managed = FALSE;
					pt_entry_t  spte;

					if ((*cpte != ARM_PTE_TYPE_FAULT)
					    && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						spte = *((volatile pt_entry_t*)cpte);
						/*
						 * Take the PVH lock for the page the PTE points to, re-reading
						 * the PTE to make sure it still refers to the same page once
						 * the lock is held.
						 */
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
				/* The full run of PTEs for this twig is now non-global. */
				clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		if (current_index < max_index) {
			/* Preempted before finishing phase 1; resume here on re-entry. */
			return vrestart;
		}
	}

	/*
	 * Phase 2:
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		/*
		 * NOTE(review): the closing paren is likely misplaced here -- the
		 * "!= 0x0ULL" compares the result of __improbable() rather than being
		 * inside it.  Behavior is unchanged since __improbable() yields the
		 * predicate's truth value, but the intended form is presumably
		 * __improbable((addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL).
		 */
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straight into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	if (addr >= true_end) {
		/* Fully done: report the completion sentinel (vend | PMAP_NEST_GRAND). */
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10363 
10364 kern_return_t
10365 pmap_unnest_options(
10366 	pmap_t grand,
10367 	addr64_t vaddr,
10368 	uint64_t size,
10369 	unsigned int option)
10370 {
10371 	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10372 	vm_map_offset_t vend = vaddr + size;
10373 
10374 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10375 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10376 
10377 	pmap_verify_preemptible();
10378 	while (vrestart != (vend | PMAP_NEST_GRAND)) {
10379 #if XNU_MONITOR
10380 		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10381 #else
10382 		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10383 #endif
10384 	}
10385 
10386 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10387 
10388 	return KERN_SUCCESS;
10389 }
10390 
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	/* The ARM pmap never adjusts the unnest range; returning TRUE simply
	 * lets the VM layer proceed with its own bookkeeping. */
	return TRUE; /* to get to log_unnest_badness()... */
}
10399 
10400 #if PMAP_FORK_NEST
10401 /**
10402  * Perform any necessary pre-nesting of the parent's shared region at fork()
10403  * time.
10404  *
10405  * @note This should only be called from vm_map_fork().
10406  *
10407  * @param old_pmap The pmap of the parent task.
10408  * @param new_pmap The pmap of the child task.
10409  * @param nesting_start An output parameter that is updated with the start
10410  *                      address of the range that was pre-nested
10411  * @param nesting_end An output parameter that is updated with the end
10412  *                      address of the range that was pre-nested
10413  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10415  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10416  */
10417 kern_return_t
10418 pmap_fork_nest(
10419 	pmap_t old_pmap,
10420 	pmap_t new_pmap,
10421 	vm_map_offset_t *nesting_start,
10422 	vm_map_offset_t *nesting_end)
10423 {
10424 	if (old_pmap == NULL || new_pmap == NULL) {
10425 		return KERN_INVALID_ARGUMENT;
10426 	}
10427 	if (old_pmap->nested_pmap == NULL) {
10428 		return KERN_SUCCESS;
10429 	}
10430 	pmap_nest(new_pmap,
10431 	    old_pmap->nested_pmap,
10432 	    old_pmap->nested_region_addr,
10433 	    old_pmap->nested_region_size);
10434 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
10435 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
10436 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
10437 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
10438 	    new_pmap->nested_pmap,
10439 	    new_pmap->nested_region_addr,
10440 	    new_pmap->nested_region_size,
10441 	    old_pmap->nested_pmap,
10442 	    old_pmap->nested_region_addr,
10443 	    old_pmap->nested_region_size);
10444 	*nesting_start = old_pmap->nested_region_addr;
10445 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
10446 	return KERN_SUCCESS;
10447 }
10448 #endif /* PMAP_FORK_NEST */
10449 
10450 /*
10451  * disable no-execute capability on
10452  * the specified pmap
10453  */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Allow mappings in this pmap to be executable (debug/development builds only). */
	pmap->nx_enabled = FALSE;
}
#else
/* No-op on release builds: NX stays enabled. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10468 
10469 /*
10470  * flush a range of hardware TLB entries.
10471  * NOTE: assumes the smallest TLB entry in use will be for
10472  * an ARM small page (4K).
10473  */
10474 
10475 #if __ARM_RANGE_TLBI__
10476 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10477 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10478 #else
10479 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10480 #endif // __ARM_RANGE_TLBI__
10481 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10482     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10483     "of npages to 32 bits below may truncate.");
10484 
/**
 * Queue TLB invalidation (asynchronously; no barriers issued here) for a
 * virtual range in the given pmap, choosing the cheapest granularity:
 * full/whole-ASID flush, ranged TLBI (when supported), or per-entry flushes.
 *
 * @param va start of the virtual range to invalidate.
 * @param length size of the range in bytes.
 * @param pmap pmap owning the range; its hw_asid tags the TLB entries.
 * @param last_level_only forwarded to the flush primitives; presumably limits
 *        invalidation to last-level (leaf) entries -- see the TLBI wrappers.
 * @param strong forwarded to the flush primitives; presumably selects a
 *        stronger-ordered TLBI variant.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/*
		 * An ASID of 0 or a nested pmap cannot be covered by a single
		 * per-ASID invalidate, so flush the entire TLB instead.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			/* Nested pmaps use the all-ASID ranged flush variant. */
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: invalidate entry by entry, each tagged with the ASID. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10537 
10538 MARK_AS_PMAP_TEXT static void
10539 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10540 {
10541 	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10542 }
10543 
/**
 * Synchronously flush TLB entries for a kernel virtual range: queue the
 * invalidations against the kernel pmap, then issue the barriers via
 * sync_tlb_flush().
 *
 * @param va start of the kernel virtual range.
 * @param length size of the range in bytes.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10552 
10553 unsigned int
10554 pmap_cache_attributes(
10555 	ppnum_t pn)
10556 {
10557 	pmap_paddr_t    paddr;
10558 	unsigned int    pai;
10559 	unsigned int    result;
10560 	pp_attr_t       pp_attr_current;
10561 
10562 	paddr = ptoa(pn);
10563 
10564 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10565 
10566 	if (!pa_valid(paddr)) {
10567 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10568 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10569 	}
10570 
10571 	result = VM_WIMG_DEFAULT;
10572 
10573 	pai = pa_index(paddr);
10574 
10575 	pp_attr_current = pp_attr_table[pai];
10576 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10577 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10578 	}
10579 	return result;
10580 }
10581 
/*
 * Perform the cache maintenance implied by changing a page's WIMG attribute
 * from wimg_bits_prev to wimg_bits_new.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Synchronize the page's caches when leaving a cacheable attribute.
	 * NOTE(review): "(wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new !=
	 * VM_WIMG_INNERWBACK)" is tautologically true (one value cannot equal
	 * both), so the WTHRU clause fires on any change away from WTHRU; "&&"
	 * was probably intended.  Confirm before changing -- the current form is
	 * merely conservative (it may flush more often than needed).
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Transitioning into VM_WIMG_RT requires a forced dcache clean of the page. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10598 
/*
 * Change the cache attributes of a single managed page on behalf of the
 * compressor, performing any required cache synchronization afterwards.
 *
 * @param pn page number of the managed page to update.
 * @param prev_cacheattr the page's previous cache attribute (may carry flags;
 *        masked with VM_WIMG_MASK below).
 * @param new_cacheattr the cache attribute to apply.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock protects this page's mapping state while attributes change. */
	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must not have their attributes changed via this path. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Flush/clean caches as dictated by the old -> new WIMG transition. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10623 
10624 void *
10625 pmap_map_compressor_page(ppnum_t pn)
10626 {
10627 #if __ARM_PTE_PHYSMAP__
10628 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10629 	if (cacheattr != VM_WIMG_DEFAULT) {
10630 #if XNU_MONITOR
10631 		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10632 #else
10633 		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10634 #endif
10635 	}
10636 #endif
10637 	return (void*)phystokv(ptoa(pn));
10638 }
10639 
10640 void
10641 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10642 {
10643 #if __ARM_PTE_PHYSMAP__
10644 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10645 	if (cacheattr != VM_WIMG_DEFAULT) {
10646 #if XNU_MONITOR
10647 		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10648 #else
10649 		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10650 #endif
10651 	}
10652 #endif
10653 }
10654 
10655 /**
10656  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10657  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10658  *
10659  * @param page_list List of pages to be updated.
10660  * @param cacheattr The new cache attribute.
10661  */
void
pmap_batch_set_cache_attributes(
	const unified_page_list_t *page_list,
	unsigned int cacheattr)
{
	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);

	/* Non-UPL-array lists: update one page at a time via the iterator. */
	if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
		/**
		 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
		 * In an ideal world we would just use these iterator functions within
		 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
		 * that means we'll need to take special care to handle pending preemption and
		 * if necessary return the iterator position out to this function and then re-enter
		 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
		 * secure manner.  Not impossible, but also not trivial, so unless someone asks for
		 * this perf improvement on the PPL I'm going to take the lazy approach here.
		 */
		unified_page_list_iterator_t iter;

		for (unified_page_list_iterator_init(page_list, &iter);
		    !unified_page_list_iterator_end(&iter);
		    unified_page_list_iterator_next(&iter)) {
			bool is_fictitious = false;
			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
			/* Fictitious pages have no physical backing to update; skip them. */
			if (__probable(!is_fictitious)) {
#if XNU_MONITOR
				pmap_set_cache_attributes_ppl(pn, cacheattr);
#else /* !XNU_MONITOR */
				pmap_set_cache_attributes_internal(pn, cacheattr);
#endif /* XNU_MONITOR */
			}
		}
		return;
	}

	/* Empty UPL: nothing to update. */
	if (page_list->upl.upl_size == 0) {
		return;
	}

	/* Initial state for the multi-pass state machine (see the _internal doc). */
	batch_set_cache_attr_state_t states;
	states.page_index = 0;
	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
	states.tlb_flush_pass_needed = false;
	states.rt_cache_flush_pass_needed = false;

	/* Verify we are being called from a preemptible context. */
	pmap_verify_preemptible();

	/*
	 * Drive the state machine to completion.  Each call may return early
	 * (e.g. to allow preemption on PPL systems) with updated state, in which
	 * case we simply call back in where it left off.
	 */
	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
#if XNU_MONITOR
		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#else /* !XNU_MONITOR */
		states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#endif /* XNU_MONITOR */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
}
10723 
10724 /**
10725  * Flushes TLB entries associated with the page specified by paddr, but do not
10726  * issue barriers yet.
10727  *
10728  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10729  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	pvh_assert_locked(pai);

	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every PTE mapping this page and queue a flush of its VA. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry; move on. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU mappings; there is no TLB entry to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Each PV entry carries up to PTE_PER_PVE PTE slots; advance through them. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10783 
10784 /**
10785  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10786  *
10787  * @param pai The Physical Address Index of the entry.
10788  * @param cacheattr The new cache attribute.
10789  */
10790 MARK_AS_PMAP_TEXT static void
10791 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10792 {
10793 	pvh_assert_locked(pai);
10794 
10795 	pp_attr_t pp_attr_current, pp_attr_template;
10796 	do {
10797 		pp_attr_current = pp_attr_table[pai];
10798 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10799 
10800 		/**
10801 		 * WIMG bits should only be updated under the PVH lock, but we should do
10802 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10803 		 */
10804 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10805 }
10806 
10807 /**
10808  * Batch updates the cache attributes of a list of pages in three passes.
10809  *
10810  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10811  * In pass two, TLB entries are flushed for each page in the list if necessary.
10812  * In pass three, caches are cleaned for each page in the list if necessary.
10813  *
10814  * When running in PPL, this function may decide to return to the caller in response
10815  * to AST_URGENT.
10816  *
10817  * @param user_page_list List of pages to be updated.
10818  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10819  * @param page_cnt Number of pages in total in user_page_list.
10820  * @param cacheattr The new cache attributes.
10821  *
10822  * @return The new state of the state machine.
10823  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable state machine handed back by the previous call (if any). */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* An unset WIMG field is treated as VM_WIMG_DEFAULT. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Pages newly becoming RT must have their caches cleaned in pass 3. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* If we bail out early, DC by VA ops must be disabled before leaving. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the state machine for the caller to hand back in if not yet DONE. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
11060 
/**
 * Set the cache attributes of a single managed page: update the WIMG bits in
 * the pp_attr_table and, if the effective WIMG value changes, rewrite every
 * PTE mapping the page (with an immediate TLB flush).
 *
 * @param pn Page number of the page to update; silently ignored if the page
 *        is not managed.
 * @param cacheattr The new cache attributes (VM_WIMG_*); VM_WIMG_USE_DEFAULT
 *        is normalized to VM_WIMG_DEFAULT.
 * @param external On PPL-enabled systems, TRUE when the request targets a
 *        non-PPL-owned page and FALSE when it targets a PPL-owned page
 *        (a mismatch panics); unused on other configurations.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The page's PPL ownership must match what the caller claims. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An unset WIMG field is treated as VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the mappings (with an immediate TLB flush) on a real change. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/*
	 * NOTE(review): presumably performs any cache maintenance required by the
	 * WIMG transition (done outside the PVH lock) — confirm against
	 * pmap_sync_wimg()'s definition.
	 */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11123 
/**
 * Internal entry point for setting the cache attributes of a page on behalf
 * of the kernel.  Passes external == TRUE, so on PPL-enabled systems
 * pmap_set_cache_attributes_priv() will panic if the page is PPL-owned.
 *
 * @param pn Page number of the page to update.
 * @param cacheattr The new cache attributes.
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
11131 
/**
 * Set the cache attributes of a page.  On PPL-enabled systems the request is
 * forwarded into the PPL; otherwise the internal implementation is called
 * directly.
 *
 * @param pn Page number of the page to update.
 * @param cacheattr The new cache attributes.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
11143 
11144 /**
11145  * Updates the page numbered ppnum to have attribute specified by attributes.
11146  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11147  * The necessity of the TLB flush is returned in case this function is called
11148  * in a batched manner and the TLB flush is intended to be done at a different
11149  * timing.
11150  *
11151  * @param ppnum Page Number of the page to be updated.
11152  * @param attributes The new cache attributes.
11153  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11154  *        immediately.
11155  *
11156  * @return Returns true if a TLB flush is needed for this update regardless of
11157  *         whether a flush has occurred already.
11158  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally reject device-memory WIMG types on managed (DRAM) pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* Update the page's mapping in the kernel's physical aperture first. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Mask XPRR bits out of the new value so the PTE's existing permissions are not clobbered. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		write_pte_fast(pte_p, tmplate);
	}
	/* The physical aperture PTE was rewritten, so a flush is needed regardless of perform_tlbi. */
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* Walk every mapping of the page recorded in the PV head. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this pv_entry. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not managed CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		/* Rewrite the PTE's memory-attribute index and shareability fields. */
		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next pv_entry once all of this one's slots are done. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11300 
11301 /**
11302  * Mark a pmap as being dedicated to use for a commpage mapping.
11303  * The pmap itself will never be activated on a CPU; its mappings will
11304  * only be embedded in userspace pmaps at a fixed virtual address.
11305  *
11306  * @param pmap the pmap to mark as belonging to a commpage.
11307  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be set up before the PPL is locked down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11327 
11328 static void
11329 pmap_update_tt3e(
11330 	pmap_t pmap,
11331 	vm_address_t address,
11332 	tt_entry_t template)
11333 {
11334 	tt_entry_t *ptep, pte;
11335 
11336 	ptep = pmap_tt3e(pmap, address);
11337 	if (ptep == NULL) {
11338 		panic("%s: no ptep?", __FUNCTION__);
11339 	}
11340 
11341 	pte = *ptep;
11342 	pte = tte_to_pa(pte) | template;
11343 	write_pte_strong(ptep, pte);
11344 }
11345 
11346 /* Note absence of non-global bit */
11347 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11348 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11349 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
11350 	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11351 
11352 /* Note absence of non-global bit and no-execute bit.  */
11353 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11354 	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11355 	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
11356 	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11357 
/**
 * Allocate the commpage backing pages and construct the dedicated commpage
 * pmap(s) whose translation tables will later be nested into user pmaps
 * (see pmap_insert_commpage_internal()).
 *
 * @param kernel_data_addr [out] KVA of the commpage data page.
 * @param kernel_text_addr [out] KVA of the commpage text page, or 0 when no
 *        text page was allocated (CONFIG_ARM_PFZ disabled).
 * @param kernel_ro_data_addr [out] KVA of the kernel RO data commpage
 *        (aliases the data page on non-PPL systems).
 * @param user_text_addr [out] randomized user VA chosen for the text
 *        commpage, or 0 when CONFIG_ARM_PFZ is disabled.
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	/* Randomize the leaf index of the text commpage within the data commpage's L3 table. */
	do {
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11508 
11509 
11510 /*
11511  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11512  * with user controlled TTEs for regions that aren't explicitly reserved by the
11513  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11514  */
#if (ARM_PGSHIFT == 14)
/**
 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
 * commpage completely above the maximum 32-bit userspace VA.
 */
/* With 16K pages nesting happens at L2 (see pmap_insert_commpage_internal), hence ARM_TT_L2_OFFMASK. */
static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);

/**
 * Normally there'd be an assert to check that 64-bit devices with 64-bit
 * userspace VAs can nest the commpage completely above the maximum 64-bit
 * userpace VA, but that technically isn't true on macOS. On those systems, the
 * commpage lives within the userspace VA range, but is protected by the VM as
 * a reserved region (see vm_reserved_regions[] definition for more info).
 */

#elif (ARM_PGSHIFT == 12)
/**
 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
 * above the maximum userspace VA.
 */
/* With 4K pages nesting may happen as high as L1 (1GB), hence ARM_TT_L1_OFFMASK. */
static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
#else
#error Nested shared page mapping is unsupported on this config
#endif
11539 
/**
 * Insert (nest) the commpage pmap's translation tables into the given user
 * pmap at the appropriate level, expanding the pmap's table hierarchy first
 * if necessary.
 *
 * @param pmap The user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success.  KERN_ABORTED (and, on PPL systems,
 *         KERN_RESOURCE_SHORTAGE) is returned when pmap_expand() fails in a
 *         way the caller can recover from; any other expansion failure panics.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* NOTE(review): NOWAIT presumably because the PPL must not block on allocation — confirm. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() runs with the pmap lock dropped; loop to re-check the TTE after relocking. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the commpage pmap's table pointer into this pmap's twig-level entry. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11642 
/**
 * Remove the nested commpage mapping from a pmap.
 *
 * Undoes pmap_insert_commpage_internal(): clears the twig-level entry that
 * pointed at the shared commpage tables and flushes stale TLB entries for
 * the commpage VA.  A pmap with no twig table at the commpage address is
 * left untouched.
 *
 * @param pmap The pmap to strip of its commpage mapping.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* No twig table at the commpage VA means nothing was ever nested. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Drop any cached translations for the now-empty commpage slot. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11699 
/**
 * Map the commpage into a user pmap, retrying until it succeeds.
 *
 * Wraps pmap_insert_commpage_internal() (or its PPL entry point), retrying on
 * KERN_ABORTED and, under XNU_MONITOR, donating pages to the PPL on
 * KERN_RESOURCE_SHORTAGE.  Panics on any other failure.
 *
 * @param pmap The user pmap to receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL ran out of free pages; hand it one and retry. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11728 
11729 static boolean_t
11730 pmap_is_64bit(
11731 	pmap_t pmap)
11732 {
11733 	return pmap->is_64bit;
11734 }
11735 
11736 bool
11737 pmap_is_exotic(
11738 	pmap_t pmap __unused)
11739 {
11740 	return false;
11741 }
11742 
11743 
/* ARMTODO: provide an implementation that accounts for
 * holes in the physical map, if any.
 */
11747 boolean_t
11748 pmap_valid_page(
11749 	ppnum_t pn)
11750 {
11751 	return pa_valid(ptoa(pn));
11752 }
11753 
11754 boolean_t
11755 pmap_bootloader_page(
11756 	ppnum_t pn)
11757 {
11758 	pmap_paddr_t paddr = ptoa(pn);
11759 
11760 	if (pa_valid(paddr)) {
11761 		return FALSE;
11762 	}
11763 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11764 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11765 }
11766 
/**
 * Scan [va_start, va_end) in a pmap for any valid leaf PTE.
 *
 * @param pmap The pmap to scan; NULL is treated as trivially empty.
 * @param va_start Start of the range to scan.
 * @param va_end End of the range to scan.
 *
 * @return TRUE if no non-empty PTE was found in the range, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/*
	 * Capture not_in_kdp once so the lock and unlock decisions below stay
	 * consistent for the duration of the scan.  User pmaps are only locked
	 * when we are not in the kernel debugger.
	 */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-sized (leaf table) block at a time. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11827 
/**
 * Check whether a VA range in a pmap contains no mappings.
 *
 * Dispatches to the PPL entry point under XNU_MONITOR, otherwise calls the
 * internal implementation directly.
 *
 * @param pmap The pmap to scan (NULL reports empty).
 * @param va_start Start of the range to scan.
 * @param va_end End of the range to scan.
 *
 * @return TRUE if no valid PTEs exist in [va_start, va_end), FALSE otherwise.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11840 
11841 vm_map_offset_t
11842 pmap_max_offset(
11843 	boolean_t               is64,
11844 	unsigned int    option)
11845 {
11846 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11847 }
11848 
/**
 * Return the maximum userspace VA for a 64-bit pmap, selected by option.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* selectors (DEFAULT, MIN,
 *               MAX, DEVICE, JUMBO, and — where supported — EXTRA_JUMBO).
 *
 * @return The chosen maximum offset.  Panics on an unrecognized option, and
 *         unconditionally on non-64-bit architectures.
 */
vm_map_offset_t
pmap_max_64bit_offset(
	__unused unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

#if defined(__arm64__)
	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm64_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = min_max_offset;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		/* A non-zero boot-arg override takes precedence over memory-size tiers. */
		if (arm64_pmap_max_offset_default) {
			max_offset_ret = arm64_pmap_max_offset_default;
		} else if (max_mem > 0xC0000000) {
			// devices with > 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
		} else if (max_mem > 0x40000000) {
			// devices with > 1GB and <= 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
		} else {
			// devices with <= 1 GB of memory
			max_offset_ret = min_max_offset;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		if (arm64_pmap_max_offset_default) {
			// Allow the boot-arg to override jumbo size
			max_offset_ret = arm64_pmap_max_offset_default;
		} else {
			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
		}
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	} else {
		panic("pmap_max_64bit_offset illegal option 0x%x", option);
	}

	/* Sanity: the result must lie within [min_max_offset, MACH_VM_MAX_ADDRESS]
	 * (DEFAULT may legitimately be 0 when no boot-arg was supplied). */
	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
		assert(max_offset_ret >= min_max_offset);
	}
#else
	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
#endif

	return max_offset_ret;
}
11901 
/**
 * Return the maximum userspace VA for a 32-bit pmap, selected by option.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* selectors.
 *
 * @return The chosen maximum offset; panics on an unrecognized option.
 */
vm_map_offset_t
pmap_max_32bit_offset(
	unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		max_offset_ret = arm_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		if (arm_pmap_max_offset_default) {
			max_offset_ret = arm_pmap_max_offset_default;
		} else if (max_mem > 0x20000000) {
			/*
			 * NOTE(review): both arms of this max_mem check currently yield
			 * VM_MAX_ADDRESS; the split looks like a placeholder for
			 * memory-size tiers — confirm before collapsing the branch.
			 */
			max_offset_ret = VM_MAX_ADDRESS;
		} else {
			max_offset_ret = VM_MAX_ADDRESS;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		max_offset_ret = VM_MAX_ADDRESS;
	} else {
		panic("pmap_max_32bit_offset illegal option 0x%x", option);
	}

	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	return max_offset_ret;
}
11931 
11932 #if CONFIG_DTRACE
11933 /*
11934  * Constrain DTrace copyin/copyout actions
11935  */
11936 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11937 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11938 
11939 kern_return_t
11940 dtrace_copyio_preflight(
11941 	__unused addr64_t va)
11942 {
11943 	if (current_map() == kernel_map) {
11944 		return KERN_FAILURE;
11945 	} else {
11946 		return KERN_SUCCESS;
11947 	}
11948 }
11949 
/*
 * DTrace copyio postflight hook: no ARM-specific cleanup is required, so
 * always report success.
 */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11956 #endif /* CONFIG_DTRACE */
11957 
11958 
/* No deferred-flush state is kept on ARM, so the context needs no initialization. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11963 
11964 
11965 void
11966 pmap_flush(
11967 	__unused pmap_flush_context *cpus_to_flush)
11968 {
11969 	/* not implemented yet */
11970 	return;
11971 }
11972 
11973 #if XNU_MONITOR
11974 
11975 /*
11976  * Enforce that the address range described by kva and nbytes is not currently
11977  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
11978  * unintentionally writing to PPL-owned memory.
11979  */
11980 void
11981 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11982 {
11983 	vm_offset_t end;
11984 	if (os_add_overflow(kva, nbytes, &end)) {
11985 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11986 	}
11987 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11988 		pmap_paddr_t pa = kvtophys_nofail(ckva);
11989 		unsigned int pai = pa_index(pa);
11990 		pp_attr_t attr;
11991 		if (__improbable(!pa_valid(pa))) {
11992 			panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11993 		}
11994 		pvh_lock(pai);
11995 		if (__improbable(ckva == phystokv(pa))) {
11996 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11997 		}
11998 		do {
11999 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12000 			if (__improbable(attr & PP_ATTR_MONITOR)) {
12001 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12002 			}
12003 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12004 		pvh_unlock(pai);
12005 		if (__improbable(kvtophys_nofail(ckva) != pa)) {
12006 			panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12007 		}
12008 	}
12009 }
12010 
12011 void
12012 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
12013 {
12014 	vm_offset_t end;
12015 	if (os_add_overflow(kva, nbytes, &end)) {
12016 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12017 	}
12018 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12019 		pmap_paddr_t pa = kvtophys_nofail(ckva);
12020 
12021 		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
12022 			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
12023 		}
12024 		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
12025 		ppattr_pa_clear_no_monitor(pa);
12026 	}
12027 }
12028 
12029 /**
12030  * Lock down a page, making all mappings read-only, and preventing further
12031  * mappings or removal of this particular kva's mapping. Effectively, it makes
12032  * the physical page at kva immutable (see the ppl_writable parameter for an
12033  * exception to this).
12034  *
12035  * @param kva Valid address to any mapping of the physical page to lockdown.
12036  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12037  * @param ppl_writable True if the PPL should still be able to write to the page
12038  *                     using the physical aperture mapping. False will make the
12039  *                     page read-only for both the kernel and PPL in the
12040  *                     physical aperture.
12041  */
12042 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Delegate with VM_PROT_READ so existing alias mappings become read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12048 
12049 /**
12050  * Lock down a page, giving all mappings the specified maximum permissions, and
12051  * preventing further mappings or removal of this particular kva's mapping.
12052  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12053  * parameter for an exception to this).
12054  *
12055  * @param kva Valid address to any mapping of the physical page to lockdown.
12056  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12057  * @param ppl_writable True if the PPL should still be able to write to the page
12058  *                     using the physical aperture mapping. False will make the
12059  *                     page read-only for both the kernel and PPL in the
12060  *                     physical aperture.
12061  * @param prot Maximum permissions to allow in existing alias mappings
12062  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must pass exactly one of the recognized lockdown reasons. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Refuse to lock down a page that is already locked down or executable. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Downgrade all existing alias mappings of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12110 
12111 /**
12112  * Helper for releasing a page from being locked down to the PPL, making it writable to the
12113  * kernel once again.
12114  *
12115  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12116  *       to unlockdown a page that was never locked down, will panic.
12117  *
12118  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
12119  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12120  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12121  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12122  *                     deviation will result in a panic.
12123  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unlockdown without a matching lockdown (with the same reason) is fatal. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12144 
12145 /**
12146  * Release a page from being locked down to the PPL, making it writable to the
12147  * kernel once again.
12148  *
12149  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12150  *       to unlockdown a page that was never locked down, will panic.
12151  *
12152  * @param kva Valid address to any mapping of the physical page to unlockdown.
12153  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12154  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12155  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12156  *                     deviation will result in a panic.
12157  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* The caller must pass exactly one of the recognized lockdown reasons. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PVH lock and do the real work under it. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
12169 
12170 #else /* XNU_MONITOR */
12171 
/* Non-PPL configurations have no PPL-ownership tracking; pinning is a no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12176 
/* Non-PPL configurations have no PPL-ownership tracking; unpinning is a no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12181 
12182 #endif /* !XNU_MONITOR */
12183 
12184 
/**
 * Lock down a range of pages for code-signing use.
 *
 * On PPL systems this records PVH_FLAG_LOCKDOWN_CS as the lockdown reason;
 * otherwise no reason flag is passed.
 *
 * @param kva Base kernel VA of the range to lock down.
 * @param size Size of the range in bytes.
 * @param ppl_writable True if the PPL should retain write access via the
 *                     physical aperture.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12194 
/**
 * Release a code-signing lockdown taken by pmap_cs_lockdown_pages().
 *
 * @param kva Base kernel VA of the range to release.
 * @param size Size of the range in bytes.
 * @param ppl_writable Must match the value passed to the paired
 *                     pmap_cs_lockdown_pages() call.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12204 
12205 /**
12206  * Perform basic validation checks on the destination only and
12207  * corresponding offset/sizes prior to writing to a read only allocation.
12208  *
12209  * @note Should be called before writing to an allocation from the read
12210  * only allocator.
12211  *
12212  * @param zid The ID of the zone the allocation belongs to.
12213  * @param va VA of element being modified (destination).
12214  * @param offset Offset being written to, in the element.
12215  * @param new_data_size Size of modification.
12216  *
12217  */
12218 
12219 MARK_AS_PMAP_TEXT static void
12220 pmap_ro_zone_validate_element_dst(
12221 	zone_id_t           zid,
12222 	vm_offset_t         va,
12223 	vm_offset_t         offset,
12224 	vm_size_t           new_data_size)
12225 {
12226 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12227 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12228 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12229 	}
12230 
12231 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12232 
12233 	/* Check element is from correct zone and properly aligned */
12234 	zone_require_ro(zid, elem_size, (void*)va);
12235 
12236 	if (__improbable(new_data_size > (elem_size - offset))) {
12237 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
12238 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12239 	}
12240 	if (__improbable(offset >= elem_size)) {
12241 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
12242 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12243 	}
12244 }
12245 
12246 
12247 /**
12248  * Perform basic validation checks on the source, destination and
12249  * corresponding offset/sizes prior to writing to a read only allocation.
12250  *
12251  * @note Should be called before writing to an allocation from the read
12252  * only allocator.
12253  *
12254  * @param zid The ID of the zone the allocation belongs to.
12255  * @param va VA of element being modified (destination).
12256  * @param offset Offset being written to, in the element.
12257  * @param new_data Pointer to new data (source).
12258  * @param new_data_size Size of modification.
12259  *
12260  */
12261 
12262 MARK_AS_PMAP_TEXT static void
12263 pmap_ro_zone_validate_element(
12264 	zone_id_t           zid,
12265 	vm_offset_t         va,
12266 	vm_offset_t         offset,
12267 	const vm_offset_t   new_data,
12268 	vm_size_t           new_data_size)
12269 {
12270 	vm_offset_t sum = 0;
12271 
12272 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12273 		panic("%s: Integer addition overflow %p + %lu = %lu",
12274 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12275 	}
12276 
12277 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12278 }
12279 
12280 /**
12281  * Ensure that physical page is locked down before writing to it.
12282  *
12283  * @note Should be called before writing to an allocation from the read
12284  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12285  * ensure that it is called after the modification.
12286  *
12287  *
12288  * @param pa Physical address of the element being modified.
12289  * @param va Virtual address of element being modified.
12290  * @param size Size of the modification.
12291  *
12292  */
12293 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	/* A single PVH lock covers one page; the write must not straddle two. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	/* Held until the matching pmap_ro_zone_unlock_phy_page() call. */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12315 
12316 /**
12317  * Unlock physical page after writing to it.
12318  *
12319  * @note Should be called after writing to an allocation from the read
12320  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12321  * ensure that it has been called prior to the modification.
12322  *
12323  * @param pa Physical address of the element that was modified.
12324  * @param va Virtual address of element that was modified.
12325  * @param size Size of the modification.
12326  *
12327  */
12328 
12329 MARK_AS_PMAP_TEXT static void
12330 pmap_ro_zone_unlock_phy_page(
12331 	const pmap_paddr_t  pa,
12332 	vm_offset_t         va __unused,
12333 	vm_size_t           size __unused)
12334 {
12335 	const unsigned int pai = pa_index(pa);
12336 	pvh_unlock(pai);
12337 }
12338 
12339 /**
12340  * Function to copy kauth_cred from new_data to kv.
12341  * Function defined in "kern_prot.c"
12342  *
12343  * @note Will be removed upon completion of
12344  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12345  *
12346  * @param kv Address to copy new data to.
12347  * @param new_data Pointer to new data.
12348  *
12349  */
12350 
12351 extern void
12352 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12353 
12354 /**
12355  * Zalloc-specific memcpy that writes through the physical aperture
12356  * and ensures the element being modified is from a read-only zone.
12357  *
12358  * @note Designed to work only with the zone allocator's read-only submap.
12359  *
12360  * @param zid The ID of the zone to allocate from.
12361  * @param va VA of element to be modified.
12362  * @param offset Offset from element.
12363  * @param new_data Pointer to new data.
12364  * @param new_data_size	Size of modification.
12365  *
12366  */
12367 
/**
 * Copy new_data_size bytes from new_data into a read-only-zone element.
 *
 * Dispatches to the PPL entry point under XNU_MONITOR, otherwise calls the
 * internal implementation directly.
 *
 * @param zid The ID of the (read-only) zone the element belongs to.
 * @param va VA of the element to modify.
 * @param offset Offset within the element to write at.
 * @param new_data Pointer to the source bytes.
 * @param new_data_size Number of bytes to copy.
 */
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12382 
12383 MARK_AS_PMAP_TEXT void
12384 pmap_ro_zone_memcpy_internal(
12385 	zone_id_t             zid,
12386 	vm_offset_t           va,
12387 	vm_offset_t           offset,
12388 	const vm_offset_t     new_data,
12389 	vm_size_t             new_data_size)
12390 {
12391 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12392 
12393 	if (!new_data || new_data_size == 0) {
12394 		return;
12395 	}
12396 
12397 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12398 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12399 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12400 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12401 }
12402 
12403 /**
12404  * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
12406  *
12407  * @note Designed to work only with the zone allocator's read-only submap.
12408  *
12409  * @param zid The ID of the zone the element belongs to.
12410  * @param va VA of element to be modified.
12411  * @param offset Offset in element.
12412  * @param op Atomic operation to perform.
12413  * @param value	Mutation value.
12414  *
12415  */
12416 
/**
 * Atomically mutate a field of a read-only-zone element.
 *
 * Dispatches to the PPL entry point under XNU_MONITOR, otherwise calls the
 * internal implementation directly.
 *
 * @param zid The ID of the (read-only) zone the element belongs to.
 * @param va VA of the element to modify.
 * @param offset Offset of the field within the element.
 * @param op Atomic operation to perform.
 * @param value Mutation value.
 *
 * @return The result reported by the atomic operation.
 */
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12431 
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of op encodes the operand size (see zro_atomic_op_t). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	/* Hold the PVH lock across the mutation through the physical aperture. */
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12450 
12451 /**
12452  * bzero for allocations from read only zones, that writes through the
12453  * physical aperture.
12454  *
12455  * @note This is called by the zfree path of all allocations from read
12456  * only zones.
12457  *
12458  * @param zid The ID of the zone the allocation belongs to.
12459  * @param va VA of element to be zeroed.
12460  * @param offset Offset in the element.
12461  * @param size	Size of allocation.
12462  *
12463  */
12464 
/**
 * Zero a region of a read-only-zone element through the physical aperture.
 *
 * Dispatches to the PPL entry point under XNU_MONITOR, otherwise calls the
 * internal implementation directly.  Called on the zfree path of all
 * read-only-zone allocations.
 *
 * @param zid The ID of the (read-only) zone the allocation belongs to.
 * @param va VA of the element to zero.
 * @param offset Offset within the element.
 * @param size Number of bytes to zero.
 */
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12478 
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Validate with a NULL source: only the destination bounds matter here. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	/* Hold the PVH lock across the write through the physical aperture. */
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12492 
12493 /**
12494  * Removes write access from the Physical Aperture.
12495  *
12496  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12497  * @note Designed to work only with the zone allocator's read-only submap.
12498  *
 * @param va VA of the page to remove write access from.
12500  *
12501  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down so that only read-only mappings are permitted. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: demote every existing mapping of the page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12511 
12512 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12513 
/**
 * Counts resident and compressed bytes in a VA range that must be covered by
 * a single twig-level TTE of the given pmap.
 *
 * @param pmap The pmap to query; NULL yields PMAP_RESIDENT_INVALID.
 * @param start Page-aligned start of the range.
 * @param end Page-aligned end of the range; must fall within the same twig
 *            entry as start (callers iterate one twig at a time).
 * @param compressed_bytes_p If non-NULL, the compressed byte count found is
 *                           ADDED to the value already stored there.
 *
 * @return The number of resident bytes found, or PMAP_RESIDENT_INVALID when
 *         the pmap is NULL or no TTE exists for the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Walk the leaf PTEs and classify each page in the range. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* The output pointer is a kernel buffer; pin it while writing. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12575 
/**
 * Computes the number of resident bytes in [start, end) of the given pmap,
 * iterating one twig-level (TTE) region at a time so each PPL call stays
 * bounded.
 *
 * @param pmap The pmap to query; PMAP_NULL reports 0 bytes.
 * @param start Page-aligned start of the range.
 * @param end Page-aligned end of the range.
 * @param compressed_bytes_p Optional out-parameter for compressed bytes.
 *
 * @return The total number of resident bytes found.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t          total_resident_bytes;
	mach_vm_size_t          compressed_bytes;
	vm_map_address_t        va;


	if (pmap == PMAP_NULL) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	total_resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	va = start;
	while (va < end) {
		vm_map_address_t l;
		mach_vm_size_t resident_bytes;

		/* Advance to the end of the current twig-level region. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > end) {
			l = end;
		}
#if XNU_MONITOR
		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
#else
		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
#endif
		if (resident_bytes == PMAP_RESIDENT_INVALID) {
			break;
		}

		total_resident_bytes += resident_bytes;

		va = l;
	}

	/*
	 * NOTE(review): pmap_query_resident_internal() accumulates directly into
	 * *compressed_bytes_p, while the local `compressed_bytes` is never
	 * updated here, so this store resets the reported compressed count to 0.
	 * Confirm whether callers rely on the accumulated value or on this reset.
	 */
	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    total_resident_bytes);

	return total_resident_bytes;
}
12637 
12638 #if MACH_ASSERT
12639 static void
12640 pmap_check_ledgers(
12641 	pmap_t pmap)
12642 {
12643 	int     pid;
12644 	char    *procname;
12645 
12646 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12647 		/*
12648 		 * This pmap was not or is no longer fully associated
12649 		 * with a task (e.g. the old pmap after a fork()/exec() or
12650 		 * spawn()).  Its "ledger" still points at a task that is
12651 		 * now using a different (and active) address space, so
12652 		 * we can't check that all the pmap ledgers are balanced here.
12653 		 *
12654 		 * If the "pid" is set, that means that we went through
12655 		 * pmap_set_process() in task_terminate_internal(), so
12656 		 * this task's ledger should not have been re-used and
12657 		 * all the pmap ledgers should be back to 0.
12658 		 */
12659 		return;
12660 	}
12661 
12662 	pid = pmap->pmap_pid;
12663 	procname = pmap->pmap_procname;
12664 
12665 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12666 }
12667 #endif /* MACH_ASSERT */
12668 
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
	/* Intentionally a no-op on this architecture. */
}
12673 
12674 /**
12675  * The minimum shared region nesting size is used by the VM to determine when to
12676  * break up large mappings to nested regions. The smallest size that these
12677  * mappings can be broken into is determined by what page table level those
12678  * regions are being nested in at and the size of the page tables.
12679  *
12680  * For instance, if a nested region is nesting at L2 for a process utilizing
12681  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12682  * block entry).
12683  *
12684  * @param pmap The target pmap to determine the block size based on whether it's
12685  *             using 16KB or 4KB page tables.
12686  */
12687 uint64_t
12688 pmap_shared_region_size_min(__unused pmap_t pmap)
12689 {
12690 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12691 
12692 	/**
12693 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12694 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12695 	 * point to shared L3 page tables in the shared region pmap.
12696 	 */
12697 	return pt_attr_twig_size(pt_attr);
12698 }
12699 
12700 boolean_t
12701 pmap_enforces_execute_only(
12702 	pmap_t pmap)
12703 {
12704 	return pmap != kernel_pmap;
12705 }
12706 
/*
 * Implementation of pmap_set_vm_map_cs_enforced: records whether the VM map
 * backed by this pmap has code-signing enforcement enabled.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12715 
/* Sets whether this pmap's VM map is under code-signing enforcement. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12727 
12728 extern int cs_process_enforcement_enable;
12729 bool
12730 pmap_get_vm_map_cs_enforced(
12731 	pmap_t pmap)
12732 {
12733 	if (cs_process_enforcement_enable) {
12734 		return true;
12735 	}
12736 	return pmap->pmap_vm_map_cs_enforced;
12737 }
12738 
/* JIT entitlement state is not tracked on this configuration; no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12745 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	/* No-op on this configuration (see the internal implementation). */
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12756 
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	/* JIT entitlement state is not tracked here; always report false. */
	return false;
}
12763 
/* TPRO state is not tracked on this configuration; no-op. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12770 
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	/* No-op on this configuration (see the internal implementation). */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12781 
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	/* TPRO state is not tracked here; always report false. */
	return false;
}
12788 
12789 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12790 
/**
 * Computes the disposition (resident / compressed / alt-accounted / reusable /
 * internal) of the page mapped at va in the given pmap.
 *
 * @param pmap The pmap to query; NULL or the kernel pmap is rejected.
 * @param va VA whose page disposition is requested.
 * @param disp_p Out-parameter receiving the PMAP_QUERY_PAGE_* flags; pinned
 *               while being written since this may run in the PPL.
 *
 * @return KERN_SUCCESS, or KERN_INVALID_ARGUMENT for a NULL/kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Take a snapshot of the PTE; re-checked below under the pvh lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping, if the page has a PVE list. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12869 
/* Queries the disposition flags of the page at va; see the internal impl. */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12882 
12883 
12884 
12885 uint32_t
12886 pmap_user_va_bits(pmap_t pmap __unused)
12887 {
12888 #if __ARM_MIXED_PAGE_SIZE__
12889 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12890 	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
12891 #else
12892 	return 64 - T0SZ_BOOT;
12893 #endif
12894 }
12895 
uint32_t
pmap_kernel_va_bits(void)
{
	/* Kernel VA width is fixed at boot: 64 - T1SZ. */
	return 64 - T1SZ_BOOT;
}
12901 
12902 static vm_map_size_t
12903 pmap_user_va_size(pmap_t pmap)
12904 {
12905 	return 1ULL << pmap_user_va_bits(pmap);
12906 }
12907 
12908 
12909 
bool
pmap_in_ppl(void)
{
	/* No PPL on this configuration; never executing in the monitor. */
	// Unsupported
	return false;
}
12916 
/* I/O-filtered protected writes require PPL support; always panics here. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12923 
void *
pmap_claim_reserved_ppl_page(void)
{
	/* No PPL page reserve on this configuration. */
	// Unsupported
	return NULL;
}
12930 
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	/* No PPL page reserve on this configuration; nothing to free. */
	// Unsupported
}
12936 
12937 
12938 #if PMAP_CS_PPL_MONITOR
12939 
12940 /* Immutable part of the trust cache runtime */
12941 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12942 
12943 /* Mutable part of the trust cache runtime */
12944 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12945 
12946 /* Lock for the trust cache runtime */
12947 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12948 
/**
 * Checks whether a trust cache with the given UUID is loaded in the PPL
 * trust cache runtime.
 *
 * @param check_uuid UUID to look for.
 *
 * @return KERN_SUCCESS if loaded, KERN_NOT_FOUND if not, KERN_NOT_SUPPORTED
 *         when the AMFI interface is too old, KERN_FAILURE on other errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	if (amfi->TrustCache.version < 3) {
		/* AMFI change hasn't landed in the build */
		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
		return KERN_NOT_SUPPORTED;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto kern_return_t for the caller. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12984 
/* Kernel entry point: performs the UUID check inside the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12991 
/**
 * Loads a trust cache of the given type into the PPL trust cache runtime.
 *
 * The payload (which embeds the libTrustCache bookkeeping structure) and the
 * manifest are locked down before validation. On success, ownership of the
 * payload pages stays with the monitor; on failure they are returned to the
 * kernel. The manifest is always returned once evaluation completes.
 *
 * @return KERN_SUCCESS, KERN_ALREADY_IN_SET for a duplicate,
 *         KERN_RESOURCE_SHORTAGE when no spare PPL page is available
 *         (caller should provide one and retry), or KERN_FAILURE.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		/* Success: the payload pages remain locked down by the monitor. */
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13111 
/*
 * Kernel entry point for loading a trust cache into the PPL. When the PPL
 * reports a resource shortage (no spare page for CoreCrypto), a page is
 * donated to the PPL free list and the call is retried.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
13141 
/**
 * Queries the PPL trust cache runtime for a CDHash. All arguments must
 * already be in PPL-safe storage (callers copy untrusted input first).
 *
 * @param query_type Kind of query to perform; validated against
 *                   kTCQueryTypeTotal.
 * @param cdhash CDHash to look up.
 * @param query_token Receives details about the matching entry, if found.
 *
 * @return KERN_SUCCESS, KERN_NOT_FOUND, KERN_INVALID_ARGUMENT for a bad
 *         query type, or KERN_FAILURE on other errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto kern_return_t for the caller. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13180 
13181 MARK_AS_PMAP_TEXT kern_return_t
13182 pmap_query_trust_cache_internal(
13183 	TCQueryType_t query_type,
13184 	const uint8_t cdhash[kTCEntryHashSize],
13185 	TrustCacheQueryToken_t *query_token)
13186 {
13187 	kern_return_t ret = KERN_NOT_FOUND;
13188 	TrustCacheQueryToken_t query_token_safe = {0};
13189 	uint8_t cdhash_safe[kTCEntryHashSize] = {0};
13190 
13191 	/* Copy in the CDHash into PPL storage */
13192 	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
13193 
13194 	/* Query through the safe API since we're in the PPL now */
13195 	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
13196 
13197 	if (query_token != NULL) {
13198 		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13199 		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
13200 		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
13201 	}
13202 
13203 	return ret;
13204 }
13205 
13206 kern_return_t
13207 pmap_query_trust_cache(
13208 	TCQueryType_t query_type,
13209 	const uint8_t cdhash[kTCEntryHashSize],
13210 	TrustCacheQueryToken_t *query_token)
13211 {
13212 	kern_return_t ret = KERN_NOT_FOUND;
13213 
13214 	ret = pmap_query_trust_cache_ppl(
13215 		query_type,
13216 		cdhash,
13217 		query_token);
13218 
13219 	return ret;
13220 }
13221 
13222 MARK_AS_PMAP_DATA bool ppl_developer_mode_set =  false;
13223 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13224 
/**
 * Updates the PPL-protected developer mode state.
 *
 * @param state Desired developer mode state. Panics on an attempt to turn
 *              developer mode back on once it has been explicitly disabled
 *              (false -> true transitions are never allowed).
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13253 
/* Kernel entry point: toggles developer mode state inside the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13260 
13261 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13262 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13263 
13264 #pragma mark Image4 - New
13265 
/* Binds an Image4 CS trap selector to the handler that services it. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;        /* requested Image4 CS trap */
	image4_cs_trap_handler_t handler; /* handler resolved for the selector */
} pmap_image4_dispatch_t;
13270 
/*
 * Services the KMOD_SET_RELEASE_TYPE selector: snapshots the caller's
 * argument structure into PPL storage, then forwards it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13290 
13291 
13292 
/*
 * Services the NONCE_SET selector: snapshots the caller's argument structure
 * into PPL storage, then forwards it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13313 
/*
 * Services the NONCE_ROLL selector: snapshots the caller's argument structure
 * into PPL storage, then forwards it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13330 
/*
 * Services the IMAGE_ACTIVATE selector. The payload and manifest regions are
 * locked down before dispatching to AppleImage4; on success the payload stays
 * owned by the monitor, while the manifest is always returned to the kernel.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13386 
13387 MARK_AS_PMAP_TEXT static errno_t
13388 _pmap_image4_monitor_trap_passthrough(
13389 	__unused const pmap_image4_dispatch_t *dispatch,
13390 	__unused const void *input_data,
13391 	__unused size_t input_size)
13392 {
13393 #if DEVELOPMENT || DEBUG || KASAN
13394 	return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13395 #else
13396 	pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13397 	return ENOSYS;
13398 #endif
13399 }
13400 
/**
 * PPL-side dispatcher for Image4 code-signing monitor traps: resolves the
 * handler for the selector, validates the input size, reserves a spare page
 * for CoreCrypto, and routes to the selector-specific shim.
 *
 * @param selector Image4 CS trap being requested.
 * @param input_data Caller-provided argument structure for the trap.
 * @param input_size Size of input_data; must match the selector's vector size.
 *
 * @return 0 on success; EINVAL for a bad selector or size, ENOMEM when the
 *         caller must donate a page and retry, or the handler's errno.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	/* Route to the shim that knows how to copy this selector's arguments. */
	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13466 
/*
 * Kernel entry point for Image4 CS monitor traps. ENOMEM from the PPL means
 * it had no spare page for CoreCrypto; donate one and retry.
 */
errno_t
pmap_image4_monitor_trap(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	errno_t err = EPERM;

	err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	while (err == ENOMEM) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Call the monitor dispatch again */
		err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	}

	return err;
}
13486 
13487 #endif /* PMAP_CS_PPL_MONITOR */
13488 
13489 #if PMAP_CS_INCLUDE_CODE_SIGNING
13490 
/*
 * Ordering callback for the provisioning-profile red-black tree. Profiles
 * are keyed by object address; returns -1/0/1 for less/equal/greater, as
 * RB_GENERATE expects.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13503 
/* Red-black tree for managing provisioning profiles, keyed by profile address */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the tree operations (RB_INSERT/RB_FIND/RB_REMOVE) over the "link" field */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree (exclusive for insert/remove, shared for lookup) */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13513 
/*
 * One-time initialization of the PPL provisioning profile state: the
 * red-black tree of registered profiles and the read-write lock which
 * protects it.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* Mark the lock as non-sleepable */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13526 
13527 static bool
13528 pmap_is_testflight_profile(
13529 	pmap_cs_profile_t *profile_obj)
13530 {
13531 	const char *entitlement_name = "beta-reports-active";
13532 	const size_t entitlement_length = strlen(entitlement_name);
13533 	CEQueryOperation_t query[2] = {0};
13534 
13535 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13536 	if (profile_obj->entitlements_ctx == NULL) {
13537 		return false;
13538 	}
13539 
13540 	/* Build our CoreEntitlements query */
13541 	query[0].opcode = kCEOpSelectKey;
13542 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13543 	query[0].parameters.stringParameter.length = entitlement_length;
13544 	query[1] = CEMatchBool(true);
13545 
13546 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13547 		profile_obj->entitlements_ctx,
13548 		query, 2);
13549 
13550 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13551 		return true;
13552 	}
13553 
13554 	return false;
13555 }
13556 
13557 static bool
13558 pmap_is_development_profile(
13559 	pmap_cs_profile_t *profile_obj)
13560 {
13561 	/* Check for UPP */
13562 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13563 		*profile_obj->profile_ctx,
13564 		CESelectDictValue("ProvisionsAllDevices"));
13565 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13566 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13567 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13568 			return false;
13569 		}
13570 	}
13571 
13572 	/* Check for TestFlight profile */
13573 	if (pmap_is_testflight_profile(profile_obj) == true) {
13574 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13575 		return false;
13576 	}
13577 
13578 	pmap_cs_log_info("%p: development profile", profile_obj);
13579 	return true;
13580 }
13581 
/*
 * Validate the "Entitlements" dictionary embedded in a provisioning profile
 * and set up a CoreEntitlements query context for it on the profile object.
 *
 * Returns:
 *   KERN_SUCCESS   - entitlements context set up on the profile object
 *   KERN_NOT_FOUND - profile provisions no entitlements (ctx left NULL)
 *   KERN_ABORTED   - CoreEntitlements validation/acquisition failure
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Select the "Entitlements" value out of the profile's DER context */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements key -- zero the storage so the state is deterministic */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Raw DER bounds of the entitlements dictionary */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER blob through the CoreEntitlements runtime */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Turn the validation result into a query context we can retain */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13632 
/*
 * Validate and register a provisioning profile with the PPL.
 *
 * The payload contains both the raw profile blob and the PPL-managed
 * pmap_cs_profile_t storage. The payload pages are locked down (kept
 * PPL-writable), the blob is evaluated through CoreTrust, a
 * CoreEntitlements context is created for its content, and the profile
 * object is inserted into the global red-black tree.
 *
 * Returns KERN_SUCCESS on success, or the page-reservation error
 * (notably KERN_RESOURCE_SHORTAGE, which the kernel-side wrapper handles
 * by donating a page and retrying). Validation failures after lockdown
 * are treated as fatal (panic).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Evaluate the profile blob through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Wrap the CoreTrust-verified content in a CoreEntitlements DER context */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13736 
13737 kern_return_t
13738 pmap_register_provisioning_profile(
13739 	const vm_address_t payload_addr,
13740 	const vm_size_t payload_size)
13741 {
13742 	kern_return_t ret = KERN_DENIED;
13743 
13744 	ret = pmap_register_provisioning_profile_ppl(
13745 		payload_addr,
13746 		payload_size);
13747 
13748 	while (ret == KERN_RESOURCE_SHORTAGE) {
13749 		/* Allocate a page from the free list */
13750 		pmap_alloc_page_for_ppl(0);
13751 
13752 		/* Attempt the call again */
13753 		ret = pmap_register_provisioning_profile_ppl(
13754 			payload_addr,
13755 			payload_size);
13756 	}
13757 
13758 	return ret;
13759 }
13760 
/*
 * Unregister a previously-registered provisioning profile.
 *
 * The profile must have no outstanding references; otherwise KERN_FAILURE
 * is returned and nothing changes. On success the profile is removed from
 * the red-black tree and the original payload pages are unlocked and
 * handed back to the kernel. Unregistering a profile which is not in the
 * tree is fatal.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A non-zero reference count means code signatures still use this profile */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size (blob + header, rounded up to a page) */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The payload memory is no longer PPL-owned; drop our pointer */
		profile_obj = NULL;
	}
	return ret;
}
13808 
13809 kern_return_t
13810 pmap_unregister_provisioning_profile(
13811 	pmap_cs_profile_t *profile_obj)
13812 {
13813 	return pmap_unregister_provisioning_profile_ppl(profile_obj);
13814 }
13815 
/*
 * Associate a registered provisioning profile with a code signature object.
 *
 * Only permitted while the signature is still untrusted and has no profile
 * already associated; takes a reference on the profile. Associating a
 * profile that is not registered, or not validated, is fatal.
 *
 * Returns KERN_SUCCESS on association, KERN_DENIED otherwise.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* Drop the exclusive code directory lock taken above */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13864 
13865 kern_return_t
13866 pmap_associate_provisioning_profile(
13867 	pmap_cs_code_directory_t *cd_entry,
13868 	pmap_cs_profile_t *profile_obj)
13869 {
13870 	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13871 }
13872 
/*
 * Break the association between a code signature object and its
 * provisioning profile, dropping the reference taken at association time.
 *
 * Returns KERN_SUCCESS on disassociation, KERN_NOT_FOUND when no profile
 * is associated with the signature.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13908 
13909 kern_return_t
13910 pmap_disassociate_provisioning_profile(
13911 	pmap_cs_code_directory_t *cd_entry)
13912 {
13913 	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13914 }
13915 
13916 kern_return_t
13917 pmap_associate_kernel_entitlements_internal(
13918 	pmap_cs_code_directory_t *cd_entry,
13919 	const void *kernel_entitlements)
13920 {
13921 	kern_return_t ret = KERN_DENIED;
13922 
13923 	if (kernel_entitlements == NULL) {
13924 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13925 	}
13926 
13927 	/* Acquire the lock on the code directory */
13928 	pmap_cs_lock_code_directory(cd_entry);
13929 
13930 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13931 		ret = KERN_DENIED;
13932 		goto out;
13933 	} else if (cd_entry->kernel_entitlements != NULL) {
13934 		ret = KERN_DENIED;
13935 		goto out;
13936 	}
13937 	cd_entry->kernel_entitlements = kernel_entitlements;
13938 
13939 	/* Association was a success */
13940 	ret = KERN_SUCCESS;
13941 
13942 out:
13943 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13944 	return ret;
13945 }
13946 
13947 kern_return_t
13948 pmap_associate_kernel_entitlements(
13949 	pmap_cs_code_directory_t *cd_entry,
13950 	const void *kernel_entitlements)
13951 {
13952 	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13953 }
13954 
/*
 * Resolve the kernel entitlements object pointer previously associated with
 * a pmap's main code signature region.
 *
 * Returns:
 *   KERN_SUCCESS   - entitlements found; written through kernel_entitlements
 *                    when that out-pointer is non-NULL
 *   KERN_NOT_FOUND - kernel pmap, no signature, or no association
 *   KERN_ABORTED   - pmap lock not immediately available (caller retries)
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14017 
14018 kern_return_t
14019 pmap_resolve_kernel_entitlements(
14020 	pmap_t pmap,
14021 	const void **kernel_entitlements)
14022 {
14023 	kern_return_t ret = KERN_DENIED;
14024 
14025 	do {
14026 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14027 	} while (ret == KERN_ABORTED);
14028 
14029 	return ret;
14030 }
14031 
/*
 * Build a CoreEntitlements acceleration index for a code directory's
 * entitlements context.
 *
 * The index buffer is placed, in order of preference:
 *   1. in unused space at the tail of the locked-down code signature,
 *   2. in a bucket from the PPL blob allocator,
 *   3. in a freshly-allocated PPL page.
 *
 * Returns KERN_SUCCESS when accelerated (or when acceleration is
 * unnecessary/ineligible), KERN_DENIED for non-reconstituted signatures,
 * KERN_ABORTED when the index cannot fit in a page, or an allocation
 * failure code (the kernel-side wrapper retries KERN_RESOURCE_SHORTAGE).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements, or already accelerated -- nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14178 
14179 kern_return_t
14180 pmap_accelerate_entitlements(
14181 	pmap_cs_code_directory_t *cd_entry)
14182 {
14183 	kern_return_t ret = KERN_DENIED;
14184 
14185 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
14186 	while (ret == KERN_RESOURCE_SHORTAGE) {
14187 		/* Allocate a page for the PPL */
14188 		pmap_alloc_page_for_ppl(0);
14189 
14190 		/* Try again */
14191 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
14192 	}
14193 
14194 	return ret;
14195 }
14196 
14197 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14198 
/*
 * Check whether a CDHash is present in any loaded (loadable) trust cache.
 * Returns true on a hit, false otherwise.
 */
MARK_AS_PMAP_TEXT bool
pmap_lookup_in_loaded_trust_caches_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	kern_return_t kr = KERN_NOT_FOUND;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	/* Copy the hash into PPL-local storage before querying */
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeLoadable,
		cdhash_safe,
		NULL);
#else
	kr = query_trust_cache(
		kTCQueryTypeLoadable,
		cdhash,
		NULL);
#endif

	if (kr == KERN_SUCCESS) {
		return true;
	}
	return false;
}
14230 
/* Kernel-side entry point: route the lookup through the PPL when present */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14241 
/*
 * Look up a CDHash in the static trust cache. On a hit, returns a packed
 * result word carrying TC_LOOKUP_FOUND, the hash type, and the entry flags
 * (shifted per the TC_LOOKUP_*_SHIFT constants); returns 0 on a miss.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	/* Copy the hash into PPL-local storage before querying */
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the entry's flags and hash type from the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14282 
/* Kernel-side entry point: route the lookup through the PPL when present */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14292 
14293 #if PMAP_CS_INCLUDE_CODE_SIGNING
14294 
/* Lock protecting the compilation service CDHash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash registered for the compilation service (all-zero until set) */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14297 
14298 MARK_AS_PMAP_TEXT void
14299 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14300 {
14301 
14302 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14303 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14304 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14305 
14306 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14307 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14308 }
14309 
/*
 * Compare a CDHash against the registered compilation service CDHash.
 * Always reports a mismatch while lockdown mode is enabled.
 */
MARK_AS_PMAP_TEXT bool
pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
{
	bool match = false;

	/* Lockdown mode disallows compilation service */
	if (ppl_lockdown_mode_enabled == true) {
		return false;
	}

	/* Compare under the lock so we never read a half-written hash */
	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
		match = true;
	}
	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);

	if (match) {
		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
	}

	return match;
}
14332 
/* Kernel-side entry point: route the update through the PPL when present */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14342 
/* Kernel-side entry point: route the comparison through the PPL when present */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14352 
14353 /*
14354  * As part of supporting local signing on the device, we need the PMAP layer
14355  * to store the local signing key so that PMAP_CS can validate with it. We
14356  * store it at the PMAP layer such that it is accessible to both AMFI and
14357  * PMAP_CS should they need it.
14358  */
/* Set once the local signing public key has been installed (set at most once) */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Storage for the local signing public key itself */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14361 
14362 MARK_AS_PMAP_TEXT void
14363 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14364 {
14365 	bool key_set = false;
14366 
14367 	/*
14368 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14369 	 * a successful exchange means that the local signing public key has _not_ been
14370 	 * set. In case the key has been set, we panic as we would never expect the
14371 	 * kernel to attempt to set the key more than once.
14372 	 */
14373 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14374 
14375 	if (key_set) {
14376 		panic("attempted to set the local signing public key multiple times");
14377 	}
14378 
14379 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14380 	pmap_cs_log_info("set local signing public key");
14381 }
14382 
/* Kernel-side entry point: route the key installation through the PPL when present */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14392 
14393 uint8_t*
14394 pmap_get_local_signing_public_key(void)
14395 {
14396 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14397 
14398 	if (key_set) {
14399 		return pmap_local_signing_public_key;
14400 	}
14401 
14402 	return NULL;
14403 }
14404 
14405 /*
14406  * Locally signed applications need to be explicitly authorized by an entitled application
14407  * before we allow them to run.
14408  */
14409 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
14410 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14411 
/*
 * Record the CDHash of a locally-signed application that has been explicitly
 * authorized to run, replacing any previously authorized CDHash.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	/* Log only a CDHash prefix, sufficient for debugging correlation. */
	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
14424 
14425 void
14426 pmap_unrestrict_local_signing(
14427 	const uint8_t cdhash[CS_CDHASH_LEN])
14428 {
14429 #if XNU_MONITOR
14430 	return pmap_unrestrict_local_signing_ppl(cdhash);
14431 #else
14432 	return pmap_unrestrict_local_signing_internal(cdhash);
14433 #endif
14434 }
14435 
14436 #if PMAP_CS
/* Revoke local-signing authorization by clearing the stored CDHash. */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14444 
14445 MARK_AS_PMAP_TEXT static bool
14446 pmap_local_signing_restricted(
14447 	const uint8_t cdhash[CS_CDHASH_LEN])
14448 {
14449 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14450 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14451 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14452 
14453 	return ret != 0;
14454 }
14455 
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14458 
14459 MARK_AS_PMAP_TEXT void
14460 pmap_footprint_suspend_internal(
14461 	vm_map_t        map,
14462 	boolean_t       suspend)
14463 {
14464 #if DEVELOPMENT || DEBUG
14465 	if (suspend) {
14466 		current_thread()->pmap_footprint_suspended = TRUE;
14467 		map->pmap->footprint_was_suspended = TRUE;
14468 	} else {
14469 		current_thread()->pmap_footprint_suspended = FALSE;
14470 	}
14471 #else /* DEVELOPMENT || DEBUG */
14472 	(void) map;
14473 	(void) suspend;
14474 #endif /* DEVELOPMENT || DEBUG */
14475 }
14476 
/*
 * Suspend or resume footprint accounting for the current thread, routing
 * through the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14488 
/*
 * No-op PPL entry point: validates the pmap handle and does nothing else.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14494 
/*
 * No-op pmap call; exercises the PPL entry and pmap-validation path without
 * any side effects.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14504 
14505 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14506 
/* Header emitted before each raw table dumped by pmap_dump_page_tables_recurse(). */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the dumped page table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA one past the range translated by this table */
};
14513 
/*
 * Recursively copy the page tables of pmap, rooted at ttp, into the buffer
 * [buf_start, buf_end).  For each table at a level selected by level_mask, a
 * page_table_dump_header is written followed by the raw table contents;
 * *bytes_copied tracks the running output size across the whole recursion.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE when the buffer cannot hold the next
 * table, KERN_SUCCESS otherwise.  Panics on a corrupt leaf-level entry.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Current output position: everything before *bytes_copied is already written. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* The root table may not be a full page; size it from the pmap. */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Require room for this table up front, even if it is filtered out below. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* This level was requested: emit the header followed by the raw table. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mapping: no next-level table to descend into. */
			continue;
		} else {
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				/* A leaf-level entry can never be a table pointer. */
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14588 
14589 kern_return_t
14590 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
14591 {
14592 	if (not_in_kdp) {
14593 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
14594 	}
14595 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
14596 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
14597 }
14598 
14599 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14600 
/* Page table dumping is only supported on arm64 DEVELOPMENT/DEBUG kernels. */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14607 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14608 
14609 
14610 #ifdef CONFIG_XNUPOST
14611 #ifdef __arm64__
14612 static volatile bool pmap_test_took_fault = false;
14613 
14614 static bool
14615 pmap_test_fault_handler(arm_saved_state_t * state)
14616 {
14617 	bool retval                 = false;
14618 	uint64_t esr                = get_saved_state_esr(state);
14619 	esr_exception_class_t class = ESR_EC(esr);
14620 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14621 
14622 	if ((class == ESR_EC_DABORT_EL1) &&
14623 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14624 		pmap_test_took_fault = true;
14625 		/* return to the instruction immediately after the call to NX page */
14626 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14627 		retval = true;
14628 	}
14629 
14630 	return retval;
14631 }
14632 
14633 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14634 static NOKASAN bool
14635 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14636 {
14637 	pmap_t old_pmap = NULL;
14638 
14639 	pmap_test_took_fault = false;
14640 
14641 	/*
14642 	 * We're potentially switching pmaps without using the normal thread
14643 	 * mechanism; disable interrupts and preemption to avoid any unexpected
14644 	 * memory accesses.
14645 	 */
14646 	uint64_t old_int_state = pmap_interrupts_disable();
14647 	mp_disable_preemption();
14648 
14649 	if (pmap != NULL) {
14650 		old_pmap = current_pmap();
14651 		pmap_switch(pmap);
14652 
14653 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
14654 #if __ARM_PAN_AVAILABLE__
14655 		__builtin_arm_wsr("pan", 0);
14656 #endif /* __ARM_PAN_AVAILABLE__ */
14657 	}
14658 
14659 	ml_expect_fault_begin(pmap_test_fault_handler, va);
14660 
14661 	if (is_write) {
14662 		*((volatile uint64_t*)(va)) = 0xdec0de;
14663 	} else {
14664 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
14665 		(void)tmp;
14666 	}
14667 
14668 	/* Save the fault bool, and undo the gross stuff we did. */
14669 	bool took_fault = pmap_test_took_fault;
14670 	ml_expect_fault_end();
14671 
14672 	if (pmap != NULL) {
14673 #if __ARM_PAN_AVAILABLE__
14674 		__builtin_arm_wsr("pan", 1);
14675 #endif /* __ARM_PAN_AVAILABLE__ */
14676 
14677 		pmap_switch(old_pmap);
14678 	}
14679 
14680 	mp_enable_preemption();
14681 	pmap_interrupts_restore(old_int_state);
14682 	bool retval = (took_fault == should_fault);
14683 	return retval;
14684 }
14685 
14686 static bool
14687 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14688 {
14689 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14690 
14691 	if (!retval) {
14692 		T_FAIL("%s: %s, "
14693 		    "pmap=%p, va=%p, should_fault=%u",
14694 		    __func__, should_fault ? "did not fault" : "faulted",
14695 		    pmap, (void*)va, (unsigned)should_fault);
14696 	}
14697 
14698 	return retval;
14699 }
14700 
14701 static bool
14702 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14703 {
14704 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14705 
14706 	if (!retval) {
14707 		T_FAIL("%s: %s, "
14708 		    "pmap=%p, va=%p, should_fault=%u",
14709 		    __func__, should_fault ? "did not fault" : "faulted",
14710 		    pmap, (void*)va, (unsigned)should_fault);
14711 	}
14712 
14713 	return retval;
14714 }
14715 
14716 static bool
14717 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14718 {
14719 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14720 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14721 
14722 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14723 
14724 	if (!retval) {
14725 		T_FAIL("%s: bits=%u, "
14726 		    "pa=%p, should_be_set=%u",
14727 		    __func__, bits,
14728 		    (void*)pa, should_be_set);
14729 	}
14730 
14731 	return retval;
14732 }
14733 
14734 static __attribute__((noinline)) bool
14735 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14736 {
14737 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14738 	return retval;
14739 }
14740 
14741 static int
14742 pmap_test_test_config(unsigned int flags)
14743 {
14744 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
14745 	unsigned int map_count = 0;
14746 	unsigned long page_ratio = 0;
14747 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
14748 
14749 	if (!pmap) {
14750 		panic("Failed to allocate pmap");
14751 	}
14752 
14753 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14754 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14755 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14756 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14757 
14758 	if (pmap_page_size <= native_page_size) {
14759 		page_ratio = native_page_size / pmap_page_size;
14760 	} else {
14761 		/*
14762 		 * We claim to support a page_ratio of less than 1, which is
14763 		 * not currently supported by the pmap layer; panic.
14764 		 */
14765 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14766 		    "flags=%u",
14767 		    __func__, native_page_size, pmap_page_size,
14768 		    flags);
14769 	}
14770 
14771 	if (PAGE_RATIO > 1) {
14772 		/*
14773 		 * The kernel is deliberately pretending to have 16KB pages.
14774 		 * The pmap layer has code that supports this, so pretend the
14775 		 * page size is larger than it is.
14776 		 */
14777 		pmap_page_size = PAGE_SIZE;
14778 		native_page_size = PAGE_SIZE;
14779 	}
14780 
14781 	/*
14782 	 * Get two pages from the VM; one to be mapped wired, and one to be
14783 	 * mapped nonwired.
14784 	 */
14785 	vm_page_t unwired_vm_page = vm_page_grab();
14786 	vm_page_t wired_vm_page = vm_page_grab();
14787 
14788 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14789 		panic("Failed to grab VM pages");
14790 	}
14791 
14792 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14793 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14794 
14795 	pmap_paddr_t pa = ptoa(pn);
14796 	pmap_paddr_t wired_pa = ptoa(wired_pn);
14797 
14798 	/*
14799 	 * We'll start mappings at the second twig TT.  This keeps us from only
14800 	 * using the first entry in each TT, which would trivially be address
14801 	 * 0; one of the things we will need to test is retrieving the VA for
14802 	 * a given PTE.
14803 	 */
14804 	vm_map_address_t va_base = pmap_twig_size;
14805 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14806 
14807 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14808 		/*
14809 		 * Not exactly a functional failure, but this test relies on
14810 		 * there being a spare PTE slot we can use to pin the TT.
14811 		 */
14812 		panic("Cannot pin translation table");
14813 	}
14814 
14815 	/*
14816 	 * Create the wired mapping; this will prevent the pmap layer from
14817 	 * reclaiming our test TTs, which would interfere with this test
14818 	 * ("interfere" -> "make it panic").
14819 	 */
14820 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14821 
14822 #if XNU_MONITOR
14823 	/*
14824 	 * If the PPL is enabled, make sure that the kernel cannot write
14825 	 * to PPL memory.
14826 	 */
14827 	if (!pmap_ppl_disable) {
14828 		T_LOG("Validate that kernel cannot write to PPL memory.");
14829 		pt_entry_t * ptep = pmap_pte(pmap, va_base);
14830 		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14831 	}
14832 #endif
14833 
14834 	/*
14835 	 * Create read-only mappings of the nonwired page; if the pmap does
14836 	 * not use the same page size as the kernel, create multiple mappings
14837 	 * so that the kernel page is fully mapped.
14838 	 */
14839 	for (map_count = 0; map_count < page_ratio; map_count++) {
14840 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14841 	}
14842 
14843 	/* Validate that all the PTEs have the expected PA and VA. */
14844 	for (map_count = 0; map_count < page_ratio; map_count++) {
14845 		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14846 
14847 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14848 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14849 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14850 		}
14851 
14852 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14853 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14854 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14855 		}
14856 	}
14857 
14858 	T_LOG("Validate that reads to our mapping do not fault.");
14859 	pmap_test_read(pmap, va_base, false);
14860 
14861 	T_LOG("Validate that writes to our mapping fault.");
14862 	pmap_test_write(pmap, va_base, true);
14863 
14864 	T_LOG("Make the first mapping writable.");
14865 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14866 
14867 	T_LOG("Validate that writes to our mapping do not fault.");
14868 	pmap_test_write(pmap, va_base, false);
14869 
14870 
14871 	T_LOG("Make the first mapping execute-only");
14872 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14873 
14874 
14875 	T_LOG("Validate that reads to our mapping do not fault.");
14876 	pmap_test_read(pmap, va_base, false);
14877 
14878 	T_LOG("Validate that writes to our mapping fault.");
14879 	pmap_test_write(pmap, va_base, true);
14880 
14881 
14882 	/*
14883 	 * For page ratios of greater than 1: validate that writes to the other
14884 	 * mappings still fault.  Remove the mappings afterwards (we're done
14885 	 * with page ratio testing).
14886 	 */
14887 	for (map_count = 1; map_count < page_ratio; map_count++) {
14888 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14889 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14890 	}
14891 
14892 	T_LOG("Mark the page unreferenced and unmodified.");
14893 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14894 	pmap_test_check_refmod(pa, 0);
14895 
14896 	/*
14897 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
14898 	 * different protection/fault_type settings, and confirm that the
14899 	 * ref/mod state matches our expectations at each step.
14900 	 */
14901 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
14902 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14903 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14904 
14905 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
14906 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14907 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14908 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14909 
14910 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
14911 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14912 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14913 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14914 
14915 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
14916 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14917 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14918 
14919 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
14920 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14921 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14922 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14923 
14924 	/*
14925 	 * Shared memory testing; we'll have two mappings; one read-only,
14926 	 * one read-write.
14927 	 */
14928 	vm_map_address_t rw_base = va_base;
14929 	vm_map_address_t ro_base = va_base + pmap_page_size;
14930 
14931 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14932 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14933 
14934 	/*
14935 	 * Test that we take faults as expected for unreferenced/unmodified
14936 	 * pages.  Also test the arm_fast_fault interface, to ensure that
14937 	 * mapping permissions change as expected.
14938 	 */
14939 	T_LOG("!ref/!mod: expect no access");
14940 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14941 	pmap_test_read_write(pmap, ro_base, false, false);
14942 	pmap_test_read_write(pmap, rw_base, false, false);
14943 
14944 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14945 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14946 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14947 	pmap_test_read_write(pmap, ro_base, true, false);
14948 	pmap_test_read_write(pmap, rw_base, true, false);
14949 
14950 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14951 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14952 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14953 	pmap_test_read_write(pmap, ro_base, true, false);
14954 	pmap_test_read_write(pmap, rw_base, true, true);
14955 
14956 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14957 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14958 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14959 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14960 	pmap_test_read_write(pmap, ro_base, true, false);
14961 	pmap_test_read_write(pmap, rw_base, true, true);
14962 
14963 	T_LOG("RW protect both mappings; should not change protections.");
14964 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14965 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14966 	pmap_test_read_write(pmap, ro_base, true, false);
14967 	pmap_test_read_write(pmap, rw_base, true, true);
14968 
14969 	T_LOG("Read protect both mappings; RW mapping should become RO.");
14970 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14971 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14972 	pmap_test_read_write(pmap, ro_base, true, false);
14973 	pmap_test_read_write(pmap, rw_base, true, false);
14974 
14975 	T_LOG("RW protect the page; mappings should not change protections.");
14976 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14977 	pmap_page_protect(pn, VM_PROT_ALL);
14978 	pmap_test_read_write(pmap, ro_base, true, false);
14979 	pmap_test_read_write(pmap, rw_base, true, true);
14980 
14981 	T_LOG("Read protect the page; RW mapping should become RO.");
14982 	pmap_page_protect(pn, VM_PROT_READ);
14983 	pmap_test_read_write(pmap, ro_base, true, false);
14984 	pmap_test_read_write(pmap, rw_base, true, false);
14985 
14986 	T_LOG("Validate that disconnect removes all known mappings of the page.");
14987 	pmap_disconnect(pn);
14988 	if (!pmap_verify_free(pn)) {
14989 		T_FAIL("Page still has mappings");
14990 	}
14991 
14992 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
14993 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14994 	pmap_destroy(pmap);
14995 
14996 	T_LOG("Release the pages back to the VM.");
14997 	vm_page_lock_queues();
14998 	vm_page_free(unwired_vm_page);
14999 	vm_page_free(wired_vm_page);
15000 	vm_page_unlock_queues();
15001 
15002 	T_LOG("Testing successful!");
15003 	return 0;
15004 }
15005 #endif /* __arm64__ */
15006 
15007 kern_return_t
15008 pmap_test(void)
15009 {
15010 	T_LOG("Starting pmap_tests");
15011 #ifdef __arm64__
15012 	int flags = 0;
15013 	flags |= PMAP_CREATE_64BIT;
15014 
15015 #if __ARM_MIXED_PAGE_SIZE__
15016 	T_LOG("Testing VM_PAGE_SIZE_4KB");
15017 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
15018 	T_LOG("Testing VM_PAGE_SIZE_16KB");
15019 	pmap_test_test_config(flags);
15020 #else /* __ARM_MIXED_PAGE_SIZE__ */
15021 	pmap_test_test_config(flags);
15022 #endif /* __ARM_MIXED_PAGE_SIZE__ */
15023 
15024 #endif /* __arm64__ */
15025 	T_PASS("completed pmap_test successfully");
15026 	return KERN_SUCCESS;
15027 }
15028 #endif /* CONFIG_XNUPOST */
15029 
15030 /*
15031  * The following function should never make it to RELEASE code, since
15032  * it provides a way to get the PPL to modify text pages.
15033  */
15034 #if DEVELOPMENT || DEBUG
15035 
#define ARM_UNDEFINED_INSN 0xe7f000f0            /* illegal A32 instruction encoding used to corrupt text */
#define ARM_UNDEFINED_INSN_THUMB 0xde00          /* illegal Thumb instruction encoding used to corrupt text */
15038 
15039 /**
15040  * Forcibly overwrite executable text with an illegal instruction.
15041  *
15042  * @note Only used for xnu unit testing.
15043  *
15044  * @param pa The physical address to corrupt.
15045  *
15046  * @return KERN_SUCCESS on success.
15047  */
15048 kern_return_t
15049 pmap_test_text_corruption(pmap_paddr_t pa)
15050 {
15051 #if XNU_MONITOR
15052 	return pmap_test_text_corruption_ppl(pa);
15053 #else /* XNU_MONITOR */
15054 	return pmap_test_text_corruption_internal(pa);
15055 #endif /* XNU_MONITOR */
15056 }
15057 
/*
 * Overwrite the instruction at pa with an illegal encoding, taking the pvh
 * lock and temporarily making the physical-aperture mapping writable when
 * the page is executable.  Test-only (DEVELOPMENT || DEBUG).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages have a read-only physical-aperture mapping (restored
	 * to AP_RONA below); temporarily grant write access for the store.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore the read-only aperture mapping. */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure the I-cache observes the newly written instruction. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15100 
15101 #endif /* DEVELOPMENT || DEBUG */
15102