xref: /xnu-12377.41.6/osfmk/arm/pmap/pmap.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 #include <machine/machine_routines.h>
75 
76 #include <arm/caches_internal.h>
77 #include <arm/cpu_data.h>
78 #include <arm/cpu_data_internal.h>
79 #include <arm/cpu_capabilities.h>
80 #include <arm/cpu_number.h>
81 #include <arm/machine_cpu.h>
82 #include <arm/misc_protos.h>
83 #include <arm/pmap/pmap_internal.h>
84 #include <arm/trap_internal.h>
85 
86 #include <arm64/proc_reg.h>
87 #include <pexpert/arm64/boot.h>
88 #include <arm64/ppl/sart.h>
89 #include <arm64/ppl/uat.h>
90 
91 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
92 #include <arm64/amcc_rorgn.h>
93 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94 
95 #include <pexpert/device_tree.h>
96 
97 #include <san/kasan.h>
98 #include <sys/cdefs.h>
99 
100 #if defined(HAS_APPLE_PAC)
101 #include <ptrauth.h>
102 #endif
103 
104 #ifdef CONFIG_XNUPOST
105 #include <tests/xnupost.h>
106 #endif
107 
108 
109 #if HAS_MTE
110 #error invalid configuration, you must be using CONFIG_SPTM
111 #endif
112 
113 #if HIBERNATION
114 #include <IOKit/IOHibernatePrivate.h>
115 #endif /* HIBERNATION */
116 
117 #define PMAP_L1_MAX_ENTRY (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) >> ARM_TT_L1_SHIFT)
118 #define PMAP_ROOT_ALLOC_SIZE ((PMAP_L1_MAX_ENTRY + 1) * sizeof(tt_entry_t))
119 
120 #ifndef __ARM64_PMAP_SUBPAGE_L1__
121 _Static_assert(ARM_PGBYTES == PMAP_ROOT_ALLOC_SIZE, "Unexpected L1 Size");
122 #endif
123 
124 #if __ARM_VMSA__ != 8
125 #error Unknown __ARM_VMSA__
126 #endif
127 
128 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
129 
130 extern u_int32_t random(void); /* from <libkern/libkern.h> */
131 
132 static bool alloc_asid(pmap_t pmap);
133 static void free_asid(pmap_t pmap);
134 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
135 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
136 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
137 
/**
 * Operations vector for native (stage-1, ASID-tagged) page tables.
 * Bundles the ASID allocator with the ASID-aware TLB flush routines and the
 * WIMG-to-PTE attribute translator so generic pmap code can stay
 * table-flavor agnostic.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
146 
/**
 * Per-level translation table geometry for the 16KB translation granule,
 * indexed by lookup level (L0-L3).  Levels 0-2 are table/block levels; the
 * L3 (leaf) level uses the page-entry valid/type encodings, which is why its
 * valid_mask and type_block fields differ from the upper levels.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
186 
/**
 * Per-level translation table geometry for the 4KB translation granule,
 * indexed by lookup level (L0-L3).  As with the 16K table above, the L3
 * (leaf) level uses the page-entry valid/type encodings.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
226 
/**
 * Per-level translation table geometry for 4KB-granule stage-2 tables.
 * Differs from the stage-1 4K table in two ways: the L0 entry is unused
 * (stage-2 lookup starts below L0 here), and the L1 level may use a wider
 * index mask when tables are concatenated to cover a 40-bit IPA space.
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
270 
/**
 * Page table attribute descriptor for the 4KB translation granule: level
 * geometry, access-permission/execute-never PTE bit encodings, page size
 * constants, and the TTBR0 VA validity mask derived from the 4KB TCR value.
 * The root level is computed from T0SZ (9 VA bits resolved per 4K level).
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
};
299 
/**
 * Page table attribute descriptor for the 16KB translation granule.
 * Mirrors pmap_pt_attr_4k but with 16K level geometry, a fixed L1 root
 * level, and the VA validity mask derived from the 16KB TCR value.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
};
320 
321 #if __ARM_16K_PG__
322 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
323 #else /* !__ARM_16K_PG__ */
324 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
325 #endif /* !__ARM_16K_PG__ */
326 
327 
328 #if DEVELOPMENT || DEBUG
329 int vm_footprint_suspend_allowed = 1;
330 
331 extern int pmap_ledgers_panic;
332 extern int pmap_ledgers_panic_leeway;
333 
334 #endif /* DEVELOPMENT || DEBUG */
335 
336 #if DEVELOPMENT || DEBUG
337 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
338 	(current_thread()->pmap_footprint_suspended)
339 #else /* DEVELOPMENT || DEBUG */
340 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
341 #endif /* DEVELOPMENT || DEBUG */
342 
343 
344 /*
345  * Represents a tlb range that will be flushed before exiting
346  * the ppl.
347  * Used by phys_attribute_clear_range to defer flushing pages in
348  * this range until the end of the operation.
349  */
350 typedef struct pmap_tlb_flush_range {
351 	pmap_t ptfr_pmap;
352 	vm_map_address_t ptfr_start;
353 	vm_map_address_t ptfr_end;
354 	bool ptfr_flush_needed;
355 } pmap_tlb_flush_range_t;
356 
357 #if XNU_MONITOR
358 /*
359  * PPL External References.
360  */
361 extern vm_offset_t   segPPLDATAB;
362 extern unsigned long segSizePPLDATA;
363 extern vm_offset_t   segPPLTEXTB;
364 extern unsigned long segSizePPLTEXT;
365 extern vm_offset_t   segPPLDATACONSTB;
366 extern unsigned long segSizePPLDATACONST;
367 
368 
369 /*
370  * PPL Global Variables
371  */
372 
373 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
374 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
375 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
376 #else
377 const boolean_t pmap_ppl_disable = FALSE;
378 #endif
379 
380 /*
381  * Indicates if the PPL has started applying APRR.
382  * This variable is accessed from various assembly trampolines, so be sure to change
383  * those if you change the size or layout of this variable.
384  */
385 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
386 
387 extern void *pmap_stacks_start;
388 extern void *pmap_stacks_end;
389 
#endif /* XNU_MONITOR */
391 
392 
393 
394 /* Virtual memory region for early allocation */
395 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
396 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
397 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
398 
399 extern uint8_t bootstrap_pagetables[];
400 
401 extern unsigned int not_in_kdp;
402 
403 extern vm_offset_t first_avail;
404 
405 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
406 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
407 extern vm_offset_t     static_memory_end;
408 
409 extern const vm_map_address_t physmap_base;
410 extern const vm_map_address_t physmap_end;
411 
412 extern int maxproc, hard_maxproc;
413 
414 /* The number of address bits one TTBR can cover. */
415 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
416 
417 /*
418  * The bounds on our TTBRs.  These are for sanity checking that
419  * an address is accessible by a TTBR before we attempt to map it.
420  */
421 
422 /* The level of the root of a page table. */
423 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
424 
425 /* The number of entries in the root TT of a page table. */
426 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
427 
428 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
429 const pmap_t    kernel_pmap = &kernel_pmap_store;
430 
431 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
432 
433 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
434 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
435 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
436 
437 typedef struct tt_free_entry {
438 	struct tt_free_entry    *next;
439 } tt_free_entry_t;
440 
441 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
442 
443 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
444 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
445 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
446 #define FREE_PAGE_SIZE_TT_MAX   4
447 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
448 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
449 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
450 
451 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
452 
453 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
454 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
455 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
456 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
457 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
458 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
459 
460 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
462 
463 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
464 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
465 
466 /* Lock group used for all pmap object locks. */
467 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
468 
469 #if DEVELOPMENT || DEBUG
470 int nx_enabled = 1;                                     /* enable no-execute protection */
471 int allow_data_exec  = 0;                               /* No apps may execute data */
472 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
473 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
474 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
475 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
476 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
477 #else /* DEVELOPMENT || DEBUG */
478 const int nx_enabled = 1;                                       /* enable no-execute protection */
479 const int allow_data_exec  = 0;                         /* No apps may execute data */
480 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
481 #endif /* DEVELOPMENT || DEBUG */
482 
483 /**
484  * This variable is set true during hibernation entry to protect pmap data structures
485  * during image copying, and reset false on hibernation exit.
486  */
487 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
488 
#if MACH_ASSERT
/* Debug-only consistency check of a pmap's ledgers; defined later in this file. */
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Compiled out when MACH_ASSERT is disabled: callers pay nothing. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
497 
498 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
499 
500 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
501 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
502 
503 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
504 
505 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
506 #if defined(__arm64__)
507 /* end of shared region + 512MB for various purposes */
508 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
509 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
510     "Minimum address space size outside allowable range");
511 
512 // Max offset is 15.375GB for devices with "large" memory config
513 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
514 // Max offset is 11.375GB for devices with "small" memory config
515 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
516 
517 
518 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
519     "Large device address space size outside allowable range");
520 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
521     "Small device address space size outside allowable range");
522 
523 #  ifdef XNU_TARGET_OS_OSX
524 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
525 #  else
526 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
527 #  endif
528 #endif /* __arm64__ */
529 
530 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
531 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
532 #else
533 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
534 #endif
535 
536 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
537 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
538 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
539 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
540 #if !HAS_16BIT_ASID
541 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
542 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
543 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
544 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
545 #else
546 static uint16_t last_allocated_asid = 0;
547 #endif /* !HAS_16BIT_ASID */
548 
549 #if HAS_SPECRES_DEBUGGING
550 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
551 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
552 #endif /* HAS_SPECRES_DEBUGGING */
553 
554 
555 #if __ARM_MIXED_PAGE_SIZE__
556 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
557 #endif
558 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
559 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
560 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
561 
562 /* PTE Define Macros */
563 
564 #define ARM_PTE_IS_COMPRESSED(x, p) \
565 	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
566 	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker" */      \
567 	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
568 	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
569 	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
570 
571 #define pte_is_wired(pte)                                                               \
572 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
573 
574 #define pte_was_writeable(pte) \
575 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
576 
577 #define pte_set_was_writeable(pte, was_writeable) \
578 	do {                                         \
579 	        if ((was_writeable)) {               \
580 	                (pte) |= ARM_PTE_WRITEABLE;  \
581 	        } else {                             \
582 	                (pte) &= ~ARM_PTE_WRITEABLE; \
583 	        }                                    \
584 	} while(0)
585 
586 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)587 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
588 {
589 	if (wired) {
590 		*ptep |= ARM_PTE_WIRED;
591 	} else {
592 		*ptep &= ~ARM_PTE_WIRED;
593 	}
594 	/*
595 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
596 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
597 	 * never reclaimed.
598 	 */
599 	if (pmap == kernel_pmap) {
600 		return;
601 	}
602 	unsigned short *ptd_wiredcnt_ptr;
603 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
604 	if (wired) {
605 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
606 	} else {
607 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
608 		if (__improbable(prev_wired == 0)) {
609 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
610 		}
611 	}
612 }
613 
#if HAS_FEAT_XS

/**
 * Determine whether a PTE carries one of the XS-tagged memory attribute
 * indices.  Stage-2 attributes use a different encoding, so stage-2 tables
 * are reported as never-XS.
 *
 * @param pt_attr Attribute descriptor for the table containing the PTE.
 * @param pte     The PTE to inspect.
 *
 * @return true iff the PTE's MAIR attribute index is one of the XS variants.
 */
static inline bool
pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
{
	if (__improbable(pt_attr->stage2)) {
		return false;
	}

	const uint64_t attr_idx = ARM_PTE_EXTRACT_ATTRINDX(pte);
	return (attr_idx == CACHE_ATTRINDX_DISABLE_XS) ||
	       (attr_idx == CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
}

#endif /* HAS_FEAT_XS */
632 
633 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
634 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
635 	arm64_sync_tlb(strong);                                                                               \
636 }
637 
638 /*
639  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
640  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
641  * will observe the updated PTE.
642  */
643 #define FLUSH_PTE()                                                                     \
644 	__builtin_arm_dmb(DMB_ISH);
645 
646 /*
647  * Synchronize updates to PTEs that were previously valid and thus may be cached in
648  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
649  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
650  * program order will not issue until the DSB completes.  Prior loads may be reordered
651  * after the barrier, but their behavior should not be materially affected by the
652  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
653  * matter for loads until the access is re-driven well after the TLB update is
654  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
655  * we should be in a position to handle access faults.  For "voluntary" PTE access
656  * restriction due to unmapping or protection, the decision to restrict access should
657  * have a data dependency on prior loads in order to avoid a data race.
658  */
659 #define FLUSH_PTE_STRONG()                                                             \
660 	__builtin_arm_dsb(DSB_ISHST);
661 
662 /**
663  * Write enough page table entries to map a single VM page. On systems where the
664  * VM page size does not match the hardware page size, multiple page table
665  * entries will need to be written.
666  *
667  * @note This function does not emit a barrier to ensure these page table writes
668  *       have completed before continuing. This is commonly needed. In the case
669  *       where a DMB or DSB barrier is needed, then use the write_pte() and
670  *       write_pte_strong() functions respectively instead of this one.
671  *
672  * @param ptep Pointer to the first page table entry to update.
673  * @param pte The value to write into each page table entry. In the case that
674  *            multiple PTEs are updated to a non-empty value, then the address
675  *            in this value will automatically be incremented for each PTE
676  *            write.
677  */
678 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)679 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
680 {
681 	/**
682 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
683 	 * systems, which is why it's checked at runtime instead of compile time.
684 	 * The "unreachable" warning needs to be suppressed because it still is a
685 	 * compile time constant on some systems.
686 	 */
687 	__unreachable_ok_push
688 	if (TEST_PAGE_RATIO_4) {
689 		if (((uintptr_t)ptep) & 0x1f) {
690 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
691 			    __func__, ptep, (void*)pte);
692 		}
693 
694 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
695 			/**
696 			 * If we're writing an empty/compressed PTE value, then don't
697 			 * auto-increment the address for each PTE write.
698 			 */
699 			*ptep = pte;
700 			*(ptep + 1) = pte;
701 			*(ptep + 2) = pte;
702 			*(ptep + 3) = pte;
703 		} else {
704 			*ptep = pte;
705 			*(ptep + 1) = pte | 0x1000;
706 			*(ptep + 2) = pte | 0x2000;
707 			*(ptep + 3) = pte | 0x3000;
708 		}
709 	} else {
710 		*ptep = pte;
711 	}
712 	__unreachable_ok_pop
713 }
714 
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Memory Barrier.
 *
 * @note The DMB issued by this function is not strong enough to protect against
 *       TLB invalidates from being reordered above the PTE writes. If a TLBI
 *       instruction is going to immediately be called after this write, it's
 *       recommended to call write_pte_strong() instead of this function.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 *
 * @param ptep Pointer to the first page table entry to update.
 * @param pte The value to write (address auto-incremented across multiple
 *            PTEs when the VM page spans several hardware pages).
 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB ISH: make the stores visible before subsequent loads. */
}
733 
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Synchronization Barrier. This
 * barrier provides stronger guarantees than the DMB executed by write_pte().
 *
 * @note This function is useful if you're going to immediately flush the TLB
 *       after making the PTE write. A DSB is required to protect against the
 *       TLB invalidate being reordered before the PTE write.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 *
 * @param ptep Pointer to the first page table entry to update.
 * @param pte The value to write (address auto-incremented across multiple
 *            PTEs when the VM page spans several hardware pages).
 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB ISHST: order PTE stores before any ensuing TLBI. */
}
752 
753 /**
754  * Retrieve the pmap structure for the thread running on the current CPU.
755  */
756 pmap_t
current_pmap()757 current_pmap()
758 {
759 	const pmap_t current = vm_map_pmap(current_thread()->map);
760 
761 	assert(current != NULL);
762 
763 #if XNU_MONITOR
764 	/**
765 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
766 	 * decided by kernel-writable memory. This function is used in various parts
767 	 * of the PPL, and besides validating that the pointer returned by this
768 	 * function is indeed a pmap structure, it's also important to ensure that
769 	 * it's actually the current thread's pmap. This is because different pmaps
770 	 * will have access to different entitlements based on the code signature of
771 	 * their loaded process. So if a different user pmap is set in the current
772 	 * thread structure (in an effort to bypass code signing restrictions), even
773 	 * though the structure would validate correctly as it is a real pmap
774 	 * structure, it should fail here.
775 	 *
776 	 * This only needs to occur for user pmaps because the kernel pmap's root
777 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
778 	 * changed so it'd be redundant to check), and its code signing fields are
779 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
780 	 * it shouldn't be possible to set those fields. Due to that, an attacker
781 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
782 	 * this check won't accomplish anything as it doesn't provide any extra code
783 	 * signing entitlements.
784 	 */
785 	if ((current != kernel_pmap) &&
786 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
787 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
788 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
789 	}
790 #endif /* XNU_MONITOR */
791 
792 	return current;
793 }
794 
#if DEVELOPMENT || DEBUG

/*
 * Trace levels are controlled by a bitmask in which each
 * level can be enabled/disabled by the (1<<level) position
 * in the boot arg
 * Level 0: PPL extension functionality
 * Level 1: pmap lifecycle (create/destroy/switch)
 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
 * Level 3: internal state management (attributes/fast-fault)
 * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
 */

SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;

/*
 * Emit a KDBG trace event iff bit (1 << level) is set in pmap_trace_mask.
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement:
 * the previous bare "if { ... }" expansion could silently capture a caller's
 * "else" clause (dangling-else hazard) when the macro was used unbraced.
 */
#define PMAP_TRACE(level, ...) \
	do { \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__); \
	        } \
	} while (0)
#else /* DEVELOPMENT || DEBUG */

/* No-op in release builds; still a single statement so call sites parse identically. */
#define PMAP_TRACE(level, ...) do { } while (0)

#endif /* DEVELOPMENT || DEBUG */
819 
820 
821 /*
822  * Internal function prototypes (forward declarations).
823  */
824 
825 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
826 
827 static void pmap_set_reference(ppnum_t pn);
828 
829 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
830 
831 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
832 
833 static kern_return_t pmap_expand(
834 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
835 
836 static int pmap_remove_range(
837 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
838 
839 static tt_entry_t *pmap_tt1_allocate(
840 	pmap_t, vm_size_t, unsigned int);
841 
842 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
843 
844 static void pmap_tt1_deallocate(
845 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
846 
847 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
848 
849 static kern_return_t pmap_tt_allocate(
850 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
851 
852 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
853 
854 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
855 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
856 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
857 
858 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
859 
860 
861 static void pmap_unmap_commpage(
862 	pmap_t pmap);
863 
864 static boolean_t
865 pmap_is_64bit(pmap_t);
866 
867 
868 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
869 
870 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
871 
872 static bool pmap_update_cache_attributes_locked(
873 	ppnum_t, unsigned, bool);
874 
875 static boolean_t arm_clear_fast_fault(
876 	ppnum_t ppnum,
877 	vm_prot_t fault_type,
878 	pt_entry_t *pte_p);
879 
880 static void pmap_trim_self(pmap_t pmap);
881 static void pmap_trim_subord(pmap_t subord);
882 
883 
884 /*
885  * Temporary prototypes, while we wait for pmap_enter to move to taking an
886  * address instead of a page number.
887  */
888 static kern_return_t
889 pmap_enter_addr(
890 	pmap_t pmap,
891 	vm_map_address_t v,
892 	pmap_paddr_t pa,
893 	vm_prot_t prot,
894 	vm_prot_t fault_type,
895 	unsigned int flags,
896 	boolean_t wired);
897 
898 kern_return_t
899 pmap_enter_options_addr(
900 	pmap_t pmap,
901 	vm_map_address_t v,
902 	pmap_paddr_t pa,
903 	vm_prot_t prot,
904 	vm_prot_t fault_type,
905 	unsigned int flags,
906 	boolean_t wired,
907 	unsigned int options,
908 	__unused void   *arg,
909 	__unused pmap_mapping_type_t mapping_type);
910 
911 #ifdef CONFIG_XNUPOST
912 kern_return_t pmap_test(void);
913 #endif /* CONFIG_XNUPOST */
914 
915 PMAP_SUPPORT_PROTOTYPES(
916 	kern_return_t,
917 	arm_fast_fault, (pmap_t pmap,
918 	vm_map_address_t va,
919 	vm_prot_t fault_type,
920 	bool was_af_fault,
921 	bool from_user), ARM_FAST_FAULT_INDEX);
922 
923 PMAP_SUPPORT_PROTOTYPES(
924 	boolean_t,
925 	arm_force_fast_fault, (ppnum_t ppnum,
926 	vm_prot_t allow_mode,
927 	int options), ARM_FORCE_FAST_FAULT_INDEX);
928 
929 MARK_AS_PMAP_TEXT static boolean_t
930 arm_force_fast_fault_with_flush_range(
931 	ppnum_t ppnum,
932 	vm_prot_t allow_mode,
933 	int options,
934 	pmap_tlb_flush_range_t *flush_range);
935 
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	:0;                                 /* Zero-width bit-field: forces padding out to the next uint64_t boundary */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4

/*
 * The whole state machine must stay register-sized (one uint64_t) —
 * NOTE(review): presumably so it can be passed/returned by value through the
 * PMAP_SUPPORT_PROTOTYPES dispatch above; confirm against that macro.
 */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
955 
956 PMAP_SUPPORT_PROTOTYPES(
957 	batch_set_cache_attr_state_t,
958 	pmap_batch_set_cache_attributes, (
959 #if XNU_MONITOR
960 		volatile upl_page_info_t *user_page_list,
961 #else /* !XNU_MONITOR */
962 		upl_page_info_array_t user_page_list,
963 #endif /* XNU_MONITOR */
964 		batch_set_cache_attr_state_t state,
965 		unsigned int page_cnt,
966 		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
967 
968 PMAP_SUPPORT_PROTOTYPES(
969 	kern_return_t,
970 	pmap_change_wiring, (pmap_t pmap,
971 	vm_map_address_t v,
972 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
973 
974 PMAP_SUPPORT_PROTOTYPES(
975 	pmap_t,
976 	pmap_create_options, (ledger_t ledger,
977 	vm_map_size_t size,
978 	unsigned int flags,
979 	kern_return_t * kr), PMAP_CREATE_INDEX);
980 
981 PMAP_SUPPORT_PROTOTYPES(
982 	void,
983 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
984 
985 PMAP_SUPPORT_PROTOTYPES(
986 	kern_return_t,
987 	pmap_enter_options, (pmap_t pmap,
988 	vm_map_address_t v,
989 	pmap_paddr_t pa,
990 	vm_prot_t prot,
991 	vm_prot_t fault_type,
992 	unsigned int flags,
993 	boolean_t wired,
994 	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
995 
996 PMAP_SUPPORT_PROTOTYPES(
997 	pmap_paddr_t,
998 	pmap_find_pa, (pmap_t pmap,
999 	addr64_t va), PMAP_FIND_PA_INDEX);
1000 
1001 PMAP_SUPPORT_PROTOTYPES(
1002 	kern_return_t,
1003 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
1004 
1005 
1006 PMAP_SUPPORT_PROTOTYPES(
1007 	boolean_t,
1008 	pmap_is_empty, (pmap_t pmap,
1009 	vm_map_offset_t va_start,
1010 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1011 
1012 
1013 PMAP_SUPPORT_PROTOTYPES(
1014 	unsigned int,
1015 	pmap_map_cpu_windows_copy, (ppnum_t pn,
1016 	vm_prot_t prot,
1017 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1018 
1019 PMAP_SUPPORT_PROTOTYPES(
1020 	void,
1021 	pmap_ro_zone_memcpy, (zone_id_t zid,
1022 	vm_offset_t va,
1023 	vm_offset_t offset,
1024 	const vm_offset_t new_data,
1025 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1026 
1027 PMAP_SUPPORT_PROTOTYPES(
1028 	uint64_t,
1029 	pmap_ro_zone_atomic_op, (zone_id_t zid,
1030 	vm_offset_t va,
1031 	vm_offset_t offset,
1032 	zro_atomic_op_t op,
1033 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1034 
1035 PMAP_SUPPORT_PROTOTYPES(
1036 	void,
1037 	pmap_ro_zone_bzero, (zone_id_t zid,
1038 	vm_offset_t va,
1039 	vm_offset_t offset,
1040 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1041 
1042 PMAP_SUPPORT_PROTOTYPES(
1043 	kern_return_t,
1044 	pmap_set_shared_region, (pmap_t grand,
1045 	pmap_t subord,
1046 	addr64_t vstart,
1047 	uint64_t size), PMAP_SET_SHARED_REGION_INDEX);
1048 
1049 PMAP_SUPPORT_PROTOTYPES(
1050 	vm_map_offset_t,
1051 	pmap_nest, (pmap_t grand,
1052 	pmap_t subord,
1053 	addr64_t vstart,
1054 	uint64_t size,
1055 	vm_map_offset_t vrestart,
1056 	kern_return_t * krp), PMAP_NEST_INDEX);
1057 
1058 PMAP_SUPPORT_PROTOTYPES(
1059 	void,
1060 	pmap_page_protect_options, (ppnum_t ppnum,
1061 	vm_prot_t prot,
1062 	unsigned int options,
1063 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1064 
1065 PMAP_SUPPORT_PROTOTYPES(
1066 	vm_map_address_t,
1067 	pmap_protect_options, (pmap_t pmap,
1068 	vm_map_address_t start,
1069 	vm_map_address_t end,
1070 	vm_prot_t prot,
1071 	unsigned int options,
1072 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1073 
1074 PMAP_SUPPORT_PROTOTYPES(
1075 	kern_return_t,
1076 	pmap_query_page_info, (pmap_t pmap,
1077 	vm_map_offset_t va,
1078 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1079 
1080 PMAP_SUPPORT_PROTOTYPES(
1081 	mach_vm_size_t,
1082 	pmap_query_resident, (pmap_t pmap,
1083 	vm_map_address_t start,
1084 	vm_map_address_t end,
1085 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1086 
1087 PMAP_SUPPORT_PROTOTYPES(
1088 	void,
1089 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1090 
1091 PMAP_SUPPORT_PROTOTYPES(
1092 	vm_map_address_t,
1093 	pmap_remove_options, (pmap_t pmap,
1094 	vm_map_address_t start,
1095 	vm_map_address_t end,
1096 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1097 
1098 
1099 PMAP_SUPPORT_PROTOTYPES(
1100 	void,
1101 	pmap_set_cache_attributes, (ppnum_t pn,
1102 	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1103 
1104 PMAP_SUPPORT_PROTOTYPES(
1105 	void,
1106 	pmap_update_compressor_page, (ppnum_t pn,
1107 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1108 
1109 PMAP_SUPPORT_PROTOTYPES(
1110 	void,
1111 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1112 
1113 #if MACH_ASSERT || XNU_MONITOR
1114 PMAP_SUPPORT_PROTOTYPES(
1115 	void,
1116 	pmap_set_process, (pmap_t pmap,
1117 	int pid,
1118 	char *procname), PMAP_SET_PROCESS_INDEX);
1119 #endif
1120 
1121 PMAP_SUPPORT_PROTOTYPES(
1122 	void,
1123 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1124 
1125 PMAP_SUPPORT_PROTOTYPES(
1126 	vm_map_offset_t,
1127 	pmap_unnest_options, (pmap_t grand,
1128 	addr64_t vaddr,
1129 	uint64_t size,
1130 	vm_map_offset_t vrestart,
1131 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1132 
1133 PMAP_SUPPORT_PROTOTYPES(
1134 	void,
1135 	phys_attribute_set, (ppnum_t pn,
1136 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1137 
1138 PMAP_SUPPORT_PROTOTYPES(
1139 	void,
1140 	phys_attribute_clear, (ppnum_t pn,
1141 	unsigned int bits,
1142 	int options,
1143 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1144 
1145 #if __ARM_RANGE_TLBI__
1146 PMAP_SUPPORT_PROTOTYPES(
1147 	vm_map_address_t,
1148 	phys_attribute_clear_range, (pmap_t pmap,
1149 	vm_map_address_t start,
1150 	vm_map_address_t end,
1151 	unsigned int bits,
1152 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1153 #endif /* __ARM_RANGE_TLBI__ */
1154 
1155 
1156 PMAP_SUPPORT_PROTOTYPES(
1157 	void,
1158 	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1159 
1160 PMAP_SUPPORT_PROTOTYPES(
1161 	void,
1162 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1163 
1164 PMAP_SUPPORT_PROTOTYPES(
1165 	void,
1166 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1167 
1168 PMAP_SUPPORT_PROTOTYPES(
1169 	void,
1170 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1171 
1172 PMAP_SUPPORT_PROTOTYPES(
1173 	void,
1174 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1175 
1176 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1177 PMAP_SUPPORT_PROTOTYPES(
1178 	void,
1179 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1180 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1181 
1182 /* Definition of the states used by pmap_trim(). */
1183 typedef enum {
1184 	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
1185 	PMAP_TRIM_STATE_START = 0,
1186 
1187 	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
1188 	PMAP_TRIM_STATE_GRAND_BEFORE,
1189 
1190 	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
1191 	PMAP_TRIM_STATE_GRAND_AFTER,
1192 
1193 	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
1194 	PMAP_TRIM_STATE_SUBORD,
1195 
1196 	/* Marks that trimming is finished. */
1197 	PMAP_TRIM_STATE_DONE,
1198 
1199 	/* Sentry enum for sanity checks. */
1200 	PMAP_TRIM_STATE_COUNT,
1201 } pmap_trim_state_t;
1202 
1203 PMAP_SUPPORT_PROTOTYPES(
1204 	pmap_trim_state_t,
1205 	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1206 
1207 #if HAS_APPLE_PAC
1208 PMAP_SUPPORT_PROTOTYPES(
1209 	void *,
1210 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1211 PMAP_SUPPORT_PROTOTYPES(
1212 	void *,
1213 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1214 #endif /* HAS_APPLE_PAC */
1215 
1216 
1217 
1218 
1219 PMAP_SUPPORT_PROTOTYPES(
1220 	kern_return_t,
1221 	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1222 	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1223 
1224 PMAP_SUPPORT_PROTOTYPES(
1225 	kern_return_t,
1226 	pmap_load_trust_cache_with_type, (TCType_t type,
1227 	const vm_address_t pmap_img4_payload,
1228 	const vm_size_t pmap_img4_payload_len,
1229 	const vm_address_t img4_manifest,
1230 	const vm_size_t img4_manifest_len,
1231 	const vm_address_t img4_aux_manifest,
1232 	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1233 
1234 PMAP_SUPPORT_PROTOTYPES(
1235 	void,
1236 	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1237 
1238 PMAP_SUPPORT_PROTOTYPES(
1239 	kern_return_t,
1240 	pmap_query_trust_cache, (TCQueryType_t query_type,
1241 	const uint8_t cdhash[kTCEntryHashSize],
1242 	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1243 
1244 PMAP_SUPPORT_PROTOTYPES(
1245 	errno_t,
1246 	pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1247 	const void *input_data,
1248 	size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1249 
1250 #if PMAP_CS_INCLUDE_CODE_SIGNING
1251 
1252 PMAP_SUPPORT_PROTOTYPES(
1253 	kern_return_t,
1254 	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1255 	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1256 
1257 PMAP_SUPPORT_PROTOTYPES(
1258 	kern_return_t,
1259 	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1260 	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1261 
1262 PMAP_SUPPORT_PROTOTYPES(
1263 	kern_return_t,
1264 	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1265 	pmap_cs_profile_t * profile_obj),
1266 	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1267 
1268 PMAP_SUPPORT_PROTOTYPES(
1269 	kern_return_t,
1270 	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1271 	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1272 
1273 PMAP_SUPPORT_PROTOTYPES(
1274 	kern_return_t,
1275 	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1276 	const void *kernel_entitlements),
1277 	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1278 
1279 PMAP_SUPPORT_PROTOTYPES(
1280 	kern_return_t,
1281 	pmap_resolve_kernel_entitlements, (pmap_t pmap,
1282 	const void **kernel_entitlements),
1283 	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1284 
1285 PMAP_SUPPORT_PROTOTYPES(
1286 	kern_return_t,
1287 	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1288 	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1289 
1290 PMAP_SUPPORT_PROTOTYPES(
1291 	kern_return_t,
1292 	pmap_cs_allow_invalid, (pmap_t pmap),
1293 	PMAP_CS_ALLOW_INVALID_INDEX);
1294 
1295 PMAP_SUPPORT_PROTOTYPES(
1296 	void,
1297 	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1298 	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1299 
1300 PMAP_SUPPORT_PROTOTYPES(
1301 	bool,
1302 	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1303 	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1304 
1305 PMAP_SUPPORT_PROTOTYPES(
1306 	void,
1307 	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1308 	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1309 
1310 PMAP_SUPPORT_PROTOTYPES(
1311 	void,
1312 	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1313 	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1314 
1315 #endif
1316 
1317 PMAP_SUPPORT_PROTOTYPES(
1318 	uint32_t,
1319 	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1320 
1321 PMAP_SUPPORT_PROTOTYPES(
1322 	bool,
1323 	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1324 
1325 PMAP_SUPPORT_PROTOTYPES(
1326 	void,
1327 	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1328 
1329 void pmap_footprint_suspend(vm_map_t    map,
1330     boolean_t   suspend);
1331 PMAP_SUPPORT_PROTOTYPES(
1332 	void,
1333 	pmap_footprint_suspend, (vm_map_t map,
1334 	boolean_t suspend),
1335 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1336 
1337 
1338 
1339 
1340 #if DEVELOPMENT || DEBUG
1341 PMAP_SUPPORT_PROTOTYPES(
1342 	kern_return_t,
1343 	pmap_test_text_corruption, (pmap_paddr_t),
1344 	PMAP_TEST_TEXT_CORRUPTION_INDEX);
1345 #endif /* DEVELOPMENT || DEBUG */
1346 
1347 /*
1348  * The low global vector page is mapped at a fixed alias.
1349  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1350  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1351  * to check both addresses anyway for backward compatibility. So for now
1352  * we leave H6 and H7 where they were.
1353  */
1354 #if (ARM_PGSHIFT == 14)
1355 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1356 #else
1357 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1358 #endif
1359 
1360 
1361 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1362 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1363 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1364 
1365 #if XNU_MONITOR
1366 
1367 #if __has_feature(ptrauth_calls)
1368 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1369 #else
1370 #define __ptrauth_ppl_handler
1371 #endif
1372 
/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the *_INDEX constants used in the PMAP_SUPPORT_PROTOTYPES()
 * invocations above; each entry is the PPL-side "_internal" implementation of
 * the corresponding pmap operation.  Entries are signed with
 * __ptrauth_ppl_handler where pointer authentication is available.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	/* Fault handling and physical-attribute management. */
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	/* pmap and mapping lifecycle. */
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_SET_SHARED_REGION_INDEX] = pmap_set_shared_region_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	/* Trust cache / code-signing support. */
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	/* Trimming, ledgers, and miscellaneous. */
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
1458 #endif
1459 
1460 #if XNU_MONITOR
1461 /**
1462  * A convenience function for setting protections on a single physical
1463  * aperture or static region mapping without invalidating the TLB.
1464  *
1465  * @note This function does not perform any TLB invalidations. That must be done
1466  *       separately to be able to safely use the updated mapping.
1467  *
1468  * @note This function understands the difference between the VM page size and
1469  *       the kernel page size and will update multiple PTEs if the sizes differ.
1470  *       In other words, enough PTEs will always get updated to change the
1471  *       permissions on a PAGE_SIZE amount of memory.
1472  *
1473  * @note The PVH lock for the physical page represented by this mapping must
1474  *       already be locked.
1475  *
1476  * @note This function assumes the caller has already verified that the PTE
1477  *       pointer does indeed point to a physical aperture or static region page
1478  *       table. Please validate your inputs before passing it along to this
1479  *       function.
1480  *
1481  * @param ptep Pointer to the physical aperture or static region page table to
1482  *             update with a new XPRR index.
1483  * @param expected_perm The XPRR index that is expected to already exist at the
1484  *                      current mapping. If the current index doesn't match this
1485  *                      then the system will panic.
1486  * @param new_perm The new XPRR index to update the mapping with.
1487  */
MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(
	pt_entry_t * const ptep,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	assert(ptep != NULL);

	/* Snapshot the PTE once; every check below validates this single read. */
	pt_entry_t spte = *ptep;
	pvh_assert_locked(pa_index(pte_to_pa(spte)));

	/* Both indices must be valid XPRR permission indices. */
	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, ptep, new_perm, expected_perm);
	}

	/**
	 * The PTE involved should be valid, should not have the hint bit set, and
	 * should have the expected XPRR index.
	 */
	if (__improbable(!pte_is_valid(spte))) {
		panic_plain("%s: physical aperture or static region PTE is invalid, "
		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
		panic_plain("%s: physical aperture or static region PTE has hint bit "
		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	/* Refuse to change permissions that aren't in the caller-expected state. */
	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
		    "ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
	}

	/* Replace only the XPRR permission bits; all other PTE bits are preserved. */
	pt_entry_t template = spte;
	template &= ~ARM_PTE_XPRR_MASK;
	template |= xprr_perm_to_pte(new_perm);

	/*
	 * Publish the new PTE. No TLB invalidation happens here by design — the
	 * caller is responsible for it (see the function header note above).
	 */
	write_pte_strong(ptep, template);
}
1532 
1533 /**
1534  * Update the protections on a single physical aperture mapping and invalidate
1535  * the TLB so the mapping can be used.
1536  *
1537  * @note The PVH lock for the physical page must already be locked.
1538  *
1539  * @param pai The physical address index of the page whose physical aperture
1540  *            mapping will be updated with new permissions.
1541  * @param expected_perm The XPRR index that is expected to already exist at the
1542  *                      current mapping. If the current index doesn't match this
1543  *                      then the system will panic.
1544  * @param new_perm The new XPRR index to update the mapping with.
1545  */
1546 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1547 pmap_set_xprr_perm(
1548 	unsigned int pai,
1549 	unsigned int expected_perm,
1550 	unsigned int new_perm)
1551 {
1552 	pvh_assert_locked(pai);
1553 
1554 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1555 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1556 
1557 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1558 
1559 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1560 	sync_tlb_flush();
1561 }
1562 
1563 /**
1564  * Update the protections on a range of physical aperture or static region
1565  * mappings and invalidate the TLB so the mappings can be used.
1566  *
1567  * @note Static region mappings can only be updated before machine_lockdown().
1568  *       Physical aperture mappings can be updated at any time.
1569  *
1570  * @param start The starting virtual address of the static region or physical
1571  *              aperture range whose permissions will be updated.
1572  * @param end The final (inclusive) virtual address of the static region or
1573  *            physical aperture range whose permissions will be updated.
1574  * @param expected_perm The XPRR index that is expected to already exist at the
1575  *                      current mappings. If the current indices don't match
1576  *                      this then the system will panic.
1577  * @param new_perm The new XPRR index to update the mappings with.
1578  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The whole range must lie inside exactly one of the two legal regions. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 *
	 * NOTE(review): this loop treats "end" as exclusive (va < end), while the
	 * function's header comment describes it as the "final (inclusive)"
	 * address — confirm which contract the callers rely on.
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Block (non-table) or invalid twig entries cannot be walked. */
		if (!tte_is_valid_table(tte)) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* The per-PTE helper requires the page's PVH lock to be held. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* Single TLB invalidation for the whole range after all PTEs are rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1674 
1675 #endif /* XNU_MONITOR */
1676 
1677 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1678 PMAP_ZINFO_PALLOC(
1679 	pmap_t pmap, int bytes)
1680 {
1681 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1682 }
1683 
1684 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1685 PMAP_ZINFO_PFREE(
1686 	pmap_t pmap,
1687 	int bytes)
1688 {
1689 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1690 }
1691 
1692 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1693 pmap_tt_ledger_credit(
1694 	pmap_t          pmap,
1695 	vm_size_t       size)
1696 {
1697 	if (pmap != kernel_pmap) {
1698 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1699 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1700 	}
1701 }
1702 
1703 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1704 pmap_tt_ledger_debit(
1705 	pmap_t          pmap,
1706 	vm_size_t       size)
1707 {
1708 	if (pmap != kernel_pmap) {
1709 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1710 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1711 	}
1712 }
1713 
/**
 * Record a hardware ASID as recently used in the pseudo-LRU tracking state.
 *
 * @param asid_index The hardware ASID index that was just handed out.
 *
 * Each 64-bit word of asid_plru_bitmap tracks 64 ASIDs; a clear bit means
 * "recently used".  If clearing this ASID's bit leaves the whole word zero,
 * every ASID in the word has been used since the word was last refilled, so
 * the word is restored to all-available and stamped with a fresh generation
 * number; alloc_asid() prefers the word with the lowest generation.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			/* The top bit of the last word maps past MAX_HW_ASIDS and must stay clear on refill. */
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1727 
/**
 * Allocate a virtual ASID for the given pmap and derive its hardware ASID.
 *
 * @param pmap The pmap to receive the new ASID; on success its hw_asid and
 *             sw_asid fields are populated.
 *
 * @return true on success, false if the virtual ASID space is exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/*
		 * Pick the pLRU word with the lowest generation number (the one
		 * least recently exhausted; see pmap_update_plru()).
		 */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Search the virtual ASID bitmap for a free slot whose hardware
		 * ASID falls in the chosen pLRU word, stepping by one chunk
		 * ((MAX_HW_ASIDS + 1) / 64 words) per iteration.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		/* pLRU (or next-fit) search failed; fall back to any free virtual ASID. */
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Virtual ASID space exhausted. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID and a software epoch. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1812 
/**
 * Release the ASID held by a pmap, making it available to future allocations.
 * Safe to call on a pmap with no ASID (hw_asid == 0); idempotent because the
 * field is atomically exchanged with 0.  Each step below inverts the
 * corresponding transform applied in alloc_asid().
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically claim the ASID; 0 means none assigned (or already freed). */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	/* Undo the EL0/EL1 ASID-pair doubling. */
	hw_asid >>= 1;
#endif
	/* Undo the +1 that accounts for ASID 0 being reserved for the kernel. */
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	/* Reconstruct the virtual ASID from the hardware ASID and sw_asid epoch. */
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reserved-VASID case: invert the high-end reassignment in alloc_asid(). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again in the pLRU tracking state. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1845 
1846 
1847 boolean_t
pmap_valid_address(pmap_paddr_t addr)1848 pmap_valid_address(
1849 	pmap_paddr_t addr)
1850 {
1851 	return pa_valid(addr);
1852 }
1853 
1854 
1855 
1856 
1857 
1858 
1859 /*
1860  *      Map memory at initialization.  The physical addresses being
1861  *      mapped are not managed and are never unmapped.
1862  *
1863  *      For now, VM is already on, we only need to map the
1864  *      specified memory.
1865  */
1866 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1867 pmap_map(
1868 	vm_map_address_t virt,
1869 	vm_offset_t start,
1870 	vm_offset_t end,
1871 	vm_prot_t prot,
1872 	unsigned int flags)
1873 {
1874 	kern_return_t   kr;
1875 	vm_size_t       ps;
1876 
1877 	ps = PAGE_SIZE;
1878 	while (start < end) {
1879 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1880 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1881 
1882 		if (kr != KERN_SUCCESS) {
1883 			panic("%s: failed pmap_enter, "
1884 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1885 			    __FUNCTION__,
1886 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1887 		}
1888 
1889 		virt += ps;
1890 		start += ps;
1891 	}
1892 	return virt;
1893 }
1894 
1895 #if XNU_MONITOR
1896 /**
1897  * Remove kernel writeablity from an IO PTE value if the page is owned by
1898  * guarded mode software.
1899  *
1900  * @param paddr The physical address of the page which has to be non-DRAM.
1901  * @param tmplate The PTE value to be evaluated.
1902  *
1903  * @return A new PTE value with permission bits modified.
1904  */
1905 static inline
1906 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1907 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1908 {
1909 	assert(!pa_valid(paddr));
1910 
1911 	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1912 
1913 	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1914 		/* PPL to own the page by converting KERN_RW to PPL_RW. */
1915 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1916 		switch (xprr_perm) {
1917 		case XPRR_KERN_RO_PERM:
1918 			break;
1919 		case XPRR_KERN_RW_PERM:
1920 			tmplate &= ~ARM_PTE_XPRR_MASK;
1921 			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1922 			break;
1923 		default:
1924 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1925 		}
1926 	}
1927 
1928 	return tmplate;
1929 }
1930 #endif /* XNU_MONITOR */
1931 
/**
 * Map the physical range [start, end) at "virt" in the kernel pmap by writing
 * PTEs directly ("back door" mapping), with cacheability chosen from the
 * PMAP_MAP_BD_* option bits.  The leaf page tables for the target VA range
 * must already exist (this routine panics if pmap_pte() returns NULL).
 *
 * @param virt    Starting kernel virtual address.
 * @param start   Starting physical address; must be page aligned.
 * @param end     Ending physical address (exclusive); must be page aligned.
 * @param prot    VM_PROT_WRITE selects kernel-RW, otherwise kernel-RO.
 * @param options PMAP_MAP_BD_* cache attribute selector.
 *
 * @return The virtual address just past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	if (__improbable(start & PAGE_MASK)) {
		panic("%s: start 0x%lx is not page aligned", __func__, start);
	}

	if (__improbable(end & PAGE_MASK)) {
		panic("%s: end 0x%lx is not page aligned", __func__, end);
	}

	if (__improbable(!gDramBase || !gDramSize)) {
		panic("%s: gDramBase/gDramSize not initialized", __func__);
	}

	/*
	 * The whole range must be uniformly DRAM or uniformly non-DRAM, since a
	 * single cache-attribute choice below applies to every page.
	 */
	const bool first_page_is_dram = is_dram_addr(start);
	for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
		if (first_page_is_dram != is_dram_addr(pa)) {
			panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
			    __func__, pa, first_page_is_dram ? "is not" : "is");
		}
	}

	/* Translate the PMAP_MAP_BD_* option into memory-attribute PTE bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		if (is_dram_addr(start)) {
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
			/* Non-DRAM write-combining uses a posted/reordered device attribute instead. */
#if HAS_FEAT_XS
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* Non-DRAM pages may be PPL-owned; adjust permissions accordingly. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale translations for the VA range just rewritten. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
2041 
2042 /*
2043  *      Back-door routine for mapping kernel VM at initialization.
2044  *      Useful for mapping memory outside the range
2045  *      [vm_first_phys, vm_last_phys] (i.e., devices).
2046  *      Otherwise like pmap_map.
2047  */
2048 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2049 pmap_map_bd(
2050 	vm_map_address_t virt,
2051 	vm_offset_t start,
2052 	vm_offset_t end,
2053 	vm_prot_t prot)
2054 {
2055 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
2056 }
2057 
2058 /*
2059  *      Back-door routine for mapping kernel VM at initialization.
2060  *      Useful for mapping memory specific physical addresses in early
2061  *      boot (i.e., before kernel_map is initialized).
2062  *
2063  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
2064  */
2065 
2066 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2067 pmap_map_high_window_bd(
2068 	vm_offset_t pa_start,
2069 	vm_size_t len,
2070 	vm_prot_t prot)
2071 {
2072 	pt_entry_t              *ptep, pte;
2073 	vm_map_address_t        va_start = VREGION1_START;
2074 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
2075 	vm_map_address_t        va_end;
2076 	vm_map_address_t        va;
2077 	vm_size_t               offset;
2078 
2079 	offset = pa_start & PAGE_MASK;
2080 	pa_start -= offset;
2081 	len += offset;
2082 
2083 	if (len > (va_max - va_start)) {
2084 		panic("%s: area too large, "
2085 		    "pa_start=%p, len=%p, prot=0x%x",
2086 		    __FUNCTION__,
2087 		    (void*)pa_start, (void*)len, prot);
2088 	}
2089 
2090 scan:
2091 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2092 		ptep = pmap_pte(kernel_pmap, va_start);
2093 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2094 		if (!pte_is_valid(*ptep)) {
2095 			break;
2096 		}
2097 	}
2098 	if (va_start > va_max) {
2099 		panic("%s: insufficient pages, "
2100 		    "pa_start=%p, len=%p, prot=0x%x",
2101 		    __FUNCTION__,
2102 		    (void*)pa_start, (void*)len, prot);
2103 	}
2104 
2105 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2106 		ptep = pmap_pte(kernel_pmap, va_end);
2107 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2108 		if (pte_is_valid(*ptep)) {
2109 			va_start = va_end + PAGE_SIZE;
2110 			goto scan;
2111 		}
2112 	}
2113 
2114 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2115 		ptep = pmap_pte(kernel_pmap, va);
2116 		pte = pa_to_pte(pa_start)
2117 		    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2118 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2119 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2120 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2121 #if __ARM_KERNEL_PROTECT__
2122 		pte |= ARM_PTE_NG;
2123 #endif /* __ARM_KERNEL_PROTECT__ */
2124 		write_pte_strong(ptep, pte);
2125 	}
2126 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2127 #if KASAN
2128 	kasan_notify_address(va_start, len);
2129 #endif
2130 	return va_start;
2131 }
2132 
2133 static uint32_t
pmap_compute_max_asids(void)2134 pmap_compute_max_asids(void)
2135 {
2136 	DTEntry entry;
2137 	void const *prop = NULL;
2138 	uint32_t max_asids;
2139 	int err;
2140 	unsigned int prop_size;
2141 
2142 	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2143 	assert(err == kSuccess);
2144 
2145 	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2146 		/* TODO: consider allowing maxproc limits to be scaled earlier so that
2147 		 * we can choose a more flexible default value here. */
2148 		return MAX_ASIDS;
2149 	}
2150 
2151 	if (prop_size != sizeof(max_asids)) {
2152 		panic("pmap-max-asids property is not a 32-bit integer");
2153 	}
2154 
2155 	max_asids = *((uint32_t const *)prop);
2156 #if HAS_16BIT_ASID
2157 	if (max_asids > MAX_HW_ASIDS) {
2158 		panic("pmap-max-asids 0x%x too large", max_asids);
2159 	}
2160 #else
2161 	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2162 	max_asids = (max_asids + 63) & ~63UL;
2163 
2164 	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2165 		/* currently capped by size of pmap->sw_asid */
2166 		panic("pmap-max-asids 0x%x too large", max_asids);
2167 	}
2168 #endif /* HAS_16BIT_ASID */
2169 	if (max_asids == 0) {
2170 		panic("pmap-max-asids cannot be zero");
2171 	}
2172 	return max_asids;
2173 }
2174 
2175 #if __arm64__
2176 /*
2177  * pmap_get_arm64_prot
2178  *
2179  * return effective armv8 VMSA block protections including
2180  * table AP/PXN/XN overrides of a pmap entry
2181  *
2182  */
2183 
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level toward the leaf, OR-ing table-level overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* No valid mapping at this level: no effective protections. */
			return 0;
		}

		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_is_table(tte)) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated table-descriptor override fields. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2228 #endif /* __arm64__ */
2229 
2230 /**
2231  * Helper macros for accessing the "unnested" and "in-progress" bits in
2232  * pmap->nested_region_unnested_table_bitmap.
2233  */
2234 #define UNNEST_BIT(index) ((index) * 2)
2235 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2236 
2237 
2238 /*
2239  *	Bootstrap the system enough to run with virtual memory.
2240  *
2241  *	The early VM initialization code has already allocated
2242  *	the first CPU's translation table and made entries for
2243  *	all the one-to-one mappings to be found there.
2244  *
2245  *	We must set up the kernel pmap structures, the
2246  *	physical-to-virtual translation lookup tables for the
2247  *	physical memory to be managed (between avail_start and
2248  *	avail_end).
2249  *
2250  *	Map the kernel's code and data, and allocate the system page table.
2251  *	Page_size must already be set.
2252  *
2253  *	Parameters:
2254  *	first_avail	first available physical page -
2255  *			   after kernel page tables
2256  *	avail_start	PA of first managed physical page
2257  *	avail_end	PA of last managed physical page
2258  */
2259 
2260 void
pmap_bootstrap(vm_offset_t vstart)2261 pmap_bootstrap(
2262 	vm_offset_t vstart)
2263 {
2264 	vm_map_offset_t maxoffset;
2265 
2266 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2267 
2268 #if XNU_MONITOR
2269 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
2270 	pmap_ppl_disable = ml_unsafe_kernel_text();
2271 #endif
2272 
2273 #endif /* XNU_MONITOR */
2274 
2275 #if DEVELOPMENT || DEBUG
2276 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2277 		kprintf("Kernel traces for pmap operations enabled\n");
2278 	}
2279 #endif
2280 
2281 	/*
2282 	 *	Initialize the kernel pmap.
2283 	 */
2284 #if ARM_PARAMETERIZED_PMAP
2285 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2286 #endif /* ARM_PARAMETERIZED_PMAP */
2287 #if HAS_APPLE_PAC
2288 	kernel_pmap->disable_jop = 0;
2289 #endif /* HAS_APPLE_PAC */
2290 	kernel_pmap->tte = cpu_tte;
2291 	kernel_pmap->ttep = cpu_ttep;
2292 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2293 	kernel_pmap->max = UINTPTR_MAX;
2294 	os_atomic_init(&kernel_pmap->ref_count, 1);
2295 #if XNU_MONITOR
2296 	os_atomic_init(&kernel_pmap->nested_count, 0);
2297 #endif
2298 	kernel_pmap->nx_enabled = TRUE;
2299 #ifdef  __arm64__
2300 	kernel_pmap->is_64bit = TRUE;
2301 #else
2302 	kernel_pmap->is_64bit = FALSE;
2303 #endif
2304 #if CONFIG_ROSETTA
2305 	kernel_pmap->is_rosetta = FALSE;
2306 #endif
2307 
2308 	kernel_pmap->nested_region_addr = 0x0ULL;
2309 	kernel_pmap->nested_region_size = 0x0ULL;
2310 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2311 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2312 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2313 
2314 	kernel_pmap->hw_asid = 0;
2315 	kernel_pmap->sw_asid = 0;
2316 
2317 	pmap_lock_init(kernel_pmap);
2318 
2319 	pmap_max_asids = pmap_compute_max_asids();
2320 #if HAS_16BIT_ASID
2321 	asid_chunk_size = MAX_HW_ASIDS;
2322 #else
2323 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2324 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2325 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2326 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2327 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2328 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2329 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2330 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2331 #endif /* HAS_16BIT_ASIDS */
2332 
2333 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2334 
2335 #if HAS_SPECRES_DEBUGGING
2336 	PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2337 
2338 	if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2339 		panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2340 	}
2341 #endif /* HAS_SPECRES_DEBUGGING */
2342 
2343 	/**
2344 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2345 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2346 	 * space for these data structures.
2347 	 */
2348 	pmap_data_bootstrap();
2349 
2350 	/**
2351 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2352 	 */
2353 	sart_bootstrap();
2354 
2355 	/**
2356 	 * Don't make any assumptions about the alignment of avail_start before this
2357 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2358 	 */
2359 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2360 
2361 	const pmap_paddr_t pmap_struct_start = avail_start;
2362 
2363 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2364 	avail_start = round_page(avail_start + asid_table_size);
2365 
2366 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2367 
2368 	vm_first_phys = gPhysBase;
2369 	vm_last_phys = trunc_page(avail_end);
2370 
2371 	queue_init(&map_pmap_list);
2372 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2373 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2374 	free_page_size_tt_count = 0;
2375 	free_page_size_tt_max = 0;
2376 	free_tt_list = TT_FREE_ENTRY_NULL;
2377 	free_tt_count = 0;
2378 	free_tt_max = 0;
2379 
2380 	virtual_space_start = vstart;
2381 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2382 
2383 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2384 #if !HAS_16BIT_ASID
2385 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2386 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2387 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2388 #endif /* !HAS_16BIT_ASID */
2389 
2390 
2391 
2392 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2393 		maxoffset = trunc_page(maxoffset);
2394 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2395 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2396 			arm_pmap_max_offset_default = maxoffset;
2397 		}
2398 	}
2399 #if defined(__arm64__)
2400 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2401 		maxoffset = trunc_page(maxoffset);
2402 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2403 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2404 			arm64_pmap_max_offset_default = maxoffset;
2405 		}
2406 	}
2407 #endif
2408 
2409 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2410 
2411 
2412 #if PMAP_CS_PPL_MONITOR
2413 	/* Initialize the PPL trust cache read-write lock */
2414 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2415 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2416 #endif
2417 
2418 #if DEVELOPMENT || DEBUG
2419 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2420 	    &vm_footprint_suspend_allowed,
2421 	    sizeof(vm_footprint_suspend_allowed));
2422 #endif /* DEVELOPMENT || DEBUG */
2423 
2424 #if KASAN
2425 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2426 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2427 #endif /* KASAN */
2428 
2429 	/**
2430 	 * Ensure that avail_start is always left on a page boundary. The calling
2431 	 * code might not perform any alignment before allocating page tables so
2432 	 * this is important.
2433 	 */
2434 	avail_start = round_page(avail_start);
2435 }
2436 
2437 #if XNU_MONITOR
2438 
2439 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2440 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2441 {
2442 	pmap_paddr_t cur_pa;
2443 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2444 		assert(pa_valid(cur_pa));
2445 		ppattr_pa_set_monitor(cur_pa);
2446 	}
2447 }
2448 
2449 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2450 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2451     pmap_paddr_t end_pa,
2452     unsigned int expected_perm,
2453     unsigned int new_perm)
2454 {
2455 	vm_offset_t start_va = phystokv(start_pa);
2456 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2457 
2458 	pa_set_range_monitor(start_pa, end_pa);
2459 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2460 }
2461 
/**
 * Set the PVH_FLAG_LOCKDOWN_KC flag on every physical page backing the
 * kernelcache, preventing further mapping changes to those pages.  Pages
 * whose physical address does not map linearly back to its kernelcache VA
 * are skipped (they will be reclaimed by the OS).  Panics if any page is
 * already locked down.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() trims range_size to the contiguous run it translated. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages are deliberately modified by XNUPOST; exempt them. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2507 
2508 void
pmap_static_allocations_done(void)2509 pmap_static_allocations_done(void)
2510 {
2511 	pmap_paddr_t monitor_start_pa;
2512 	pmap_paddr_t monitor_end_pa;
2513 
2514 	/*
2515 	 * Protect the bootstrap (V=P and V->P) page tables.
2516 	 *
2517 	 * These bootstrap allocations will be used primarily for page tables.
2518 	 * If we wish to secure the page tables, we need to start by marking
2519 	 * these bootstrap allocations as pages that we want to protect.
2520 	 */
2521 	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2522 	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2523 
	/* The bootstrap page tables are mapped RW at bootstrap. */
2525 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2526 
2527 	/*
2528 	 * We use avail_start as a pointer to the first address that has not
2529 	 * been reserved for bootstrap, so we know which pages to give to the
2530 	 * virtual memory layer.
2531 	 */
2532 	monitor_start_pa = first_avail_phys;
2533 	monitor_end_pa = avail_start;
2534 
2535 	/* The other bootstrap allocations are mapped RW at bootstrap. */
2536 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2537 
2538 	/*
2539 	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2540 	 * to RO in arm_vm_prot_finalize(), which is called after this function.
2541 	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2542 	 * they can't be allocated for other uses.  We don't need a special xPRR
2543 	 * protection index, as there is no PPL_RO index, and these pages are ultimately
2544 	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
2545 	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2546 	 * lookup table index to USER_XO before APRR is applied, leading the hardware
2547 	 * to believe we are dealing with an user XO page upon performing a translation.
2548 	 */
2549 	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2550 	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2551 	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2552 
2553 	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2554 	monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2555 
2556 	/* PPL data is RW for the PPL, RO for the kernel. */
2557 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2558 
2559 	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2560 	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2561 
2562 	/* PPL text is RX for the PPL, RO for the kernel. */
2563 	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2564 
2565 
2566 	/*
2567 	 * In order to support DTrace, the save areas for the PPL must be
2568 	 * writable.  This is due to the fact that DTrace will try to update
2569 	 * register state.
2570 	 */
2571 	if (pmap_ppl_disable) {
2572 		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2573 		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2574 
2575 		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2576 	}
2577 
2578 
2579 	if (segSizePPLDATACONST > 0) {
2580 		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2581 		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2582 
2583 		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2584 	}
2585 
2586 	/*
2587 	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2588 	 * precaution.  The real RW mappings are at a different location with guard pages.
2589 	 */
2590 	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2591 
2592 	/* Prevent remapping of the kernelcache */
2593 	pmap_lockdown_kc();
2594 }
2595 
2596 
/**
 * Transition the PPL into its locked-down (enforcing) state.
 *
 * Locks down the commpage mappings (RO data always; the text commpage only
 * when one exists) under disabled preemption, as required by the _nopreempt
 * locking primitives used by the lockdown helpers.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error appears unconditional as seen here, which would
	 * fail every build.  Presumably it sits under an xPRR-configuration #if that
	 * was elided from this excerpt — confirm against the full file before editing.
	 */
#error "XPRR configuration error"
}
2613 #endif /* XNU_MONITOR */
2614 
2615 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2616 pmap_virtual_space(
2617 	vm_offset_t *startp,
2618 	vm_offset_t *endp
2619 	)
2620 {
2621 	*startp = virtual_space_start;
2622 	*endp = virtual_space_end;
2623 }
2624 
2625 
/**
 * Enumerate reserved kernel virtual regions that the VM layer must not use.
 *
 * The set of regions depends on the kernel-integrity (KTRR/CTRR) and
 * ARM_LARGE_MEMORY configuration; callers probe increasing values of
 * region_select until this function returns FALSE.
 *
 * @param region_select Index of the region being queried (0-based).
 * @param startp On success, filled with the region's start address.
 * @param size   On success, filled with the region's size in bytes.
 *
 * @return TRUE if region_select names a valid region for this configuration,
 *         FALSE otherwise.
 */
__mockable boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K pages: align to the 32MB L2 block boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K pages: align to the 8MB L2 block boundary. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only reserve the low-globals block if region 0 did not already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
	return ret;
}
2719 
2720 /*
2721  * Routines to track and allocate physical pages during early boot.
2722  * On most systems that memory runs from first_avail through to avail_end
2723  * with no gaps.
2724  *
2725  * If the system supports ECC and ecc_bad_pages_count > 0, we
2726  * need to skip those pages.
2727  */
2728 
/* Count of physical pages still handed out by pmap_next_page(); computed lazily. */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() runs (boot is single-threaded at that point). */
static bool need_ram_ranges_init = true;
2731 
2732 
/**
 * Checks to see if a given page is in
 * the array of known bad pages.
 *
 * @param ppn page number to check
 *
 * @return Always false on this configuration — bad-RAM (ECC) page tracking
 *         is not implemented here, so no page is ever reported bad.
 */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	return false;
}
2744 
2745 /**
2746  * Prepare bad ram pages to be skipped.
2747  */
2748 
2749 /*
2750  * Initialize the count of available pages. No lock needed here,
2751  * as this code is called while kernel boot up is single threaded.
2752  */
2753 static void
initialize_ram_ranges(void)2754 initialize_ram_ranges(void)
2755 {
2756 	pmap_paddr_t first = first_avail;
2757 	pmap_paddr_t end = avail_end;
2758 
2759 	assert(first <= end);
2760 	assert(first == (first & ~PAGE_MASK));
2761 	assert(end == (end & ~PAGE_MASK));
2762 	avail_page_count = atop(end - first);
2763 
2764 	need_ram_ranges_init = false;
2765 }
2766 
2767 unsigned int
pmap_free_pages(void)2768 pmap_free_pages(
2769 	void)
2770 {
2771 	if (need_ram_ranges_init) {
2772 		initialize_ram_ranges();
2773 	}
2774 	return avail_page_count;
2775 }
2776 
2777 unsigned int
pmap_free_pages_span(void)2778 pmap_free_pages_span(
2779 	void)
2780 {
2781 	if (need_ram_ranges_init) {
2782 		initialize_ram_ranges();
2783 	}
2784 	return (unsigned int)atop(avail_end - first_avail);
2785 }
2786 
2787 
/*
 * Grab the next available physical page "from the top".  This platform keeps
 * a single linear range, so this simply forwards to pmap_next_page(); the
 * might_free hint is ignored.
 */
boolean_t
pmap_next_page_hi(
	ppnum_t            * pnum,
	__unused boolean_t might_free)
{
	return pmap_next_page(pnum);
}
2795 
2796 
2797 boolean_t
pmap_next_page(ppnum_t * pnum)2798 pmap_next_page(
2799 	ppnum_t *pnum)
2800 {
2801 	if (need_ram_ranges_init) {
2802 		initialize_ram_ranges();
2803 	}
2804 
2805 
2806 	if (first_avail != avail_end) {
2807 		*pnum = (ppnum_t)atop(first_avail);
2808 		first_avail += PAGE_SIZE;
2809 		assert(avail_page_count > 0);
2810 		--avail_page_count;
2811 		return TRUE;
2812 	}
2813 	assert(avail_page_count == 0);
2814 	return FALSE;
2815 }
2816 
2817 
/**
 * Helper function to check whether the given physical
 * page number is a restricted page.
 *
 * @param pn the physical page number to query.
 *
 * @return Always false — no page-restriction tracking exists on this
 *         configuration.
 */
bool
pmap_is_page_restricted(__unused ppnum_t pn)
{
	return false;
}
2829 
/*
 *	Initialize the pmap module.
 *	Called by vm_init, to initialize any structures that the pmap
 *	system needs to map virtual memory.
 */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent translation
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids, since each
	 * process needs its own ASID.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2875 
2876 /**
2877  * Verify that a given physical page contains no mappings (outside of the
2878  * default physical aperture mapping).
2879  *
2880  * @param ppnum Physical page number to check there are no mappings to.
2881  *
2882  * @return True if there are no mappings, false otherwise or if the page is not
2883  *         kernel-managed.
2884  */
2885 bool
pmap_verify_free(ppnum_t ppnum)2886 pmap_verify_free(ppnum_t ppnum)
2887 {
2888 	const pmap_paddr_t pa = ptoa(ppnum);
2889 
2890 	assert(pa != vm_page_fictitious_addr);
2891 
2892 	/* Only mappings to kernel-managed physical memory are tracked. */
2893 	if (!pa_valid(pa)) {
2894 		return false;
2895 	}
2896 
2897 	const unsigned int pai = pa_index(pa);
2898 	pv_entry_t **pvh = pai_to_pvh(pai);
2899 
2900 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2901 }
2902 
2903 #if MACH_ASSERT
/**
 * Verify that a given physical page contains no mappings (outside of the
 * default physical aperture mapping) and if it does, then panic.
 *
 * @note It's recommended to use pmap_verify_free() directly when operating in
 *       the PPL since the PVH lock isn't getting grabbed here (due to this code
 *       normally being called from outside of the PPL, and the pv_head_table
 *       can't be modified outside of the PPL).
 *
 * @param ppnum Physical page number to check there are no mappings to.
 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* From here on the page provably has at least one mapping: gather panic info. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PVH points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2987 #endif /* MACH_ASSERT */
2988 
/*
 * Panic if the given physical page still has any mappings; called before the
 * VM recycles a page that is expected to be completely unmapped.
 */
inline void
pmap_recycle_page(ppnum_t pn)
{
	const bool is_freed = pmap_verify_free(pn);

	if (__improbable(!is_freed)) {
		/*
		 * There is a redundancy here, but we are going to panic anyways,
		 * and pmap_assert_free() traces useful information about the first
		 * remaining mapping. So, we keep this behavior.
		 */
#if MACH_ASSERT
		pmap_assert_free(pn);
#endif /* MACH_ASSERT */
		/* Reached only on non-MACH_ASSERT builds (pmap_assert_free panics first otherwise). */
		panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn));
	}
}
3006 
3007 
/*
 * Compute the size in bytes of the root translation table for the given pmap,
 * derived from the valid-VA mask and the root level's index shift.
 */
static vm_size_t
pmap_root_alloc_size(pmap_t pmap)
{
/* NOTE(review): pmap appears used below, but on configs where pmap_get_pt_attr()
 * ignores its argument this pragma suppresses the unused-parameter warning —
 * confirm before removing. */
#pragma unused(pmap)
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	unsigned int root_level = pt_attr_root_level(pt_attr);
	const uint64_t index = pt_attr_va_valid_mask(pt_attr);
	/* Number of root-level entries = (max valid index >> shift) + 1. */
	return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
}
3017 
3018 
/*
 *	Create and return a physical map.
 *
 *	If the size specified for the map
 *	is zero, the map is an actual physical
 *	map, and may be referenced by the
 *	hardware.
 *
 *	If the size specified is non-zero,
 *	the map will be used in software only, and
 *	is bounded by that size.
 */
/**
 * @param ledger Ledger to charge this pmap's memory accounting against
 *               (may be NULL; retained on the PPL side when present).
 * @param size   Must be 0 here; non-zero sizes are rejected (stage-2 only).
 * @param flags  PMAP_CREATE_* option bits; unknown bits cause failure.
 * @param kr     Out: KERN_SUCCESS, KERN_RESOURCE_SHORTAGE, or KERN_NO_SPACE.
 *
 * @return The new pmap with ref_count 1, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve an ASID (or equivalent ID) for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	/* Inside the PPL we cannot block; the caller retries after refilling the free list. */
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it while the PPL writes through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3221 
/*
 * Kernel-facing wrapper for pmap creation.  Takes a reference on the ledger,
 * dispatches to the PPL (retrying on resource shortage after donating a page)
 * or directly to pmap_create_options_internal(), and drops the ledger
 * reference on failure.
 */
__mockable pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	/*
	 * The PPL cannot block on allocation; on KERN_RESOURCE_SHORTAGE, feed it
	 * another page and retry until it succeeds or fails for another reason.
	 */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	/* NOTE(review): pmap may be PMAP_NULL here; presumably PMAP_VASID/hw_asid
	 * tracing tolerates that — confirm against the macro definitions. */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3257 
3258 #if XNU_MONITOR
3259 /*
3260  * This symbol remains in place when the PPL is enabled so that the dispatch
3261  * table does not change from development to release configurations.
3262  */
3263 #endif
3264 #if MACH_ASSERT || XNU_MONITOR
/*
 * Record the owning process's pid and name on the pmap for debugging
 * (MACH_ASSERT builds only; a pmap_pid of -1 means "do not track").
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3282 #endif /* MACH_ASSERT || XNU_MONITOR */
3283 
3284 #if MACH_ASSERT
/*
 * Kernel-facing wrapper: attach the owning process's pid/name to a pmap,
 * dispatching through the PPL when XNU_MONITOR is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3297 #endif /* MACH_ASSERT */
3298 
/*
 * pmap_deallocate_all_leaf_tts:
 *
 * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
 * removing and deallocating all TTEs.
 */
/**
 * @param pmap       The pmap whose tables are being torn down.
 * @param first_ttep First entry of the translation table at this level.
 * @param level      Current table level; must be above the leaf level.
 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of ~0 (the highest VA) yields the last entry at this level. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in a user pmap being destroyed. */
		if (tte_is_block(tte)) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching unlock visible here — presumably
		 * pmap_tte_deallocate() drops the lock; confirm its contract. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3343 
3344 /*
3345  * We maintain stats and ledgers so that a task's physical footprint is:
3346  * phys_footprint = ((internal - alternate_accounting)
3347  *                   + (internal_compressed - alternate_accounting_compressed)
3348  *                   + iokit_mapped
3349  *                   + purgeable_nonvolatile
3350  *                   + purgeable_nonvolatile_compressed
3351  *                   + page_table)
3352  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3353  */
3354 
/*
 *	Retire the given physical map from service.
 *	Should only be called if the map contains
 *	no valid mappings.
 */
/**
 * Drops one reference; actual teardown (TT deallocation, TLB flush, ASID
 * release, ledger checks, struct free) happens only when the count hits zero.
 * Panics on refcount underflow or attempts to destroy the kernel/commpage
 * pmaps or a pmap still nested/active elsewhere.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		/* Other holders remain; nothing else to do. */
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Ensure no CPU still has this pmap active or in-flight. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove from the global pmap list before tearing down the tables. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested pmaps have no ASID of their own; just flush their region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3473 
/*
 * Kernel-facing wrapper for pmap destruction.  Captures the ledger pointer
 * before the pmap may be freed by the internal/PPL call, then drops the
 * reference pmap_create_options() took on it.
 */
__mockable void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Read before the destroy call: pmap may be freed below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3494 
3495 
3496 /*
3497  *	Add a reference to the specified pmap.
3498  */
3499 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3500 pmap_reference_internal(
3501 	pmap_t pmap)
3502 {
3503 	if (pmap != PMAP_NULL) {
3504 		validate_pmap_mutable(pmap);
3505 		os_atomic_inc(&pmap->ref_count, acquire);
3506 	}
3507 }
3508 
/**
 * External entry point for taking a reference on a pmap. Dispatches to
 * the PPL trampoline on XNU_MONITOR configurations, otherwise calls the
 * internal implementation directly.
 */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3519 
/**
 * Allocate a top-level (TT1/root) translation table for a pmap.
 *
 * Sub-page root tables (when PMAP_ROOT_ALLOC_SIZE < PAGE_SIZE) are carved
 * out of full pages: the remainder of a freshly allocated page is threaded
 * onto the global free_tt_list so subsequent allocations can be satisfied
 * without going back to the VM.
 *
 * @param pmap The pmap the table is being allocated for (ledger accounting).
 * @param size Requested table size; rounded up to PAGE_SIZE unless it is
 *             exactly PMAP_ROOT_ALLOC_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block on the
 *               page allocator.
 *
 * @return Pointer to the zeroed table, or NULL on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only the exact root-allocation size may be sub-page; anything else rounds up. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: try to pop an entry off the appropriate global free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate (at least) a full zeroed page from the VM. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	/*
	 * For a sub-page allocation, the caller gets the first chunk of the page;
	 * chain the remaining (PAGE_SIZE / size) - 1 chunks onto the global free
	 * list. The chain is built locally first so the lock is held only for the
	 * final splice.
	 */
	if (size < PAGE_SIZE) {
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3600 
/**
 * Return a top-level (TT1/root) translation table to the global free lists,
 * trimming the page-sized free list back down if it has grown beyond
 * FREE_PAGE_SIZE_TT_MAX.
 *
 * @param pmap The pmap the table belonged to (ledger accounting only).
 * @param tt The table being freed.
 * @param size Table size; normalized the same way as in pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trimming loop (which
 *               may call into the VM to free pages).
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size normalization done at allocation time. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Sub-page tables go on the sub-page free list... */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	/* ...and page-sized tables on the page-sized free list. */
	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list: pop excess pages and hand them back to
	 * the VM. The lock is dropped around pmap_pages_free() since that call
	 * may block; the count was already decremented under the lock so other
	 * threads won't free the same page.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3659 
/**
 * Allocate a non-root translation table page for the given pmap.
 *
 * First tries the pmap's per-pmap free list (populated when the pmap's page
 * size is smaller than the kernel PAGE_SIZE); otherwise allocates a fresh
 * zeroed VM page, attaches a page table descriptor (PTD) to it, and threads
 * any sub-page remainder back onto the pmap's free list.
 *
 * @param pmap The pmap the table belongs to.
 * @param ttp Out parameter: receives the KVA of the new table (NULL on failure).
 * @param level Page table level being allocated (used for usage counters only).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid blocking.
 *
 * @return KERN_SUCCESS, KERN_RESOURCE_SHORTAGE (NOWAIT allocation failure),
 *         or KERN_ABORTED (preemption pending while acquiring the pmap lock).
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		/*
		 * NOTE(review): alloc_flags is derived from PMAP_TT_ALLOCATE_NOWAIT
		 * while the retry loops below bail on PMAP_OPTIONS_NOWAIT — presumably
		 * callers pass both flags together; confirm against call sites.
		 */
		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Twig-and-above tables count as TTE pages; leaf tables as PTE pages. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Install the PTD as the PV head for the page table page. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		/*
		 * If the pmap's page size is smaller than the kernel's, the caller only
		 * consumes the first chunk; thread the remaining chunks of the page onto
		 * the pmap's free list for future allocations.
		 */
		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3760 
3761 
/**
 * Return a page table to the pmap's free list, and free the backing physical
 * page once every pmap-page-sized chunk of that page is free.
 *
 * For configurations where the pmap page size is smaller than PAGE_SIZE,
 * a single physical page backs several tables; the page can only be freed
 * once all of its chunks sit on pmap->tt_entry_free and no chunk holds any
 * PTE references.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp KVA of the table being deallocated.
 * @param level Page table level of the table (for usage counters only).
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-page-sized tables that share one physical page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; reset it before the zero check. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every table chunk sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Count how many chunks of this physical page are already on the
		 * pmap's free list (the one being freed counts as the first).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/* Every chunk is free: unlink them all and free the whole page below. */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other chunks of the page still hold references; just recycle this one. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Detach and free the PTD, clear the PV head, and return the page to the VM. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3850 
/**
 * Safely clear out a translation table entry.
 *
 * @note If the TTE to clear out points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 * @note This function expects to be called with pmap locked exclusive, and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the TTE that should be cleared out.
 * @param level The level of the page table that contains the TTE to be removed.
 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and force it out to the page tables before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			/* Poll each CPU; a set bitmap bit means that CPU has not yet been seen idle. */
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Exclusive-monitor armed: WFE sleeps until the flag's cacheline changes. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
4040 
/**
 * Given a pointer to an entry within a `level` page table, delete the
 * page table at `level` + 1 that is represented by that entry. For instance,
 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
 * contains the PA of the L3 table, and `level` would be "2".
 *
 * @note If the table getting deallocated is a leaf table, then that leaf table
 *       must have a refcnt of zero before getting deallocated. All other levels
 *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
 * @note This function expects to be called with pmap locked exclusive and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap that owns the page table to be deallocated.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the `level` TTE to remove.
 * @param level The level of the table that contains an entry pointing to the
 *              table to be removed. The deallocated page table will be a
 *              `level` + 1 table (so if `level` is 2, then an L3 table will be
 *              deleted).
 */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Snapshot the TTE before pmap_tte_remove() clears it. */
	tte = *ttep;

	/* Ownership check: the table's descriptor must point back at this pmap. */
	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
	    (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* The lock is no longer held; free the now-unreachable level+1 table. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
4091 
4092 /*
4093  *	Remove a range of hardware page-table entries.
4094  *	The entries given are the first (inclusive)
4095  *	and last (exclusive) entries for the VM pages.
4096  *	The virtual address is the va for the first pte.
4097  *
4098  *	The pmap must be locked.
4099  *	If the pmap is not the kernel pmap, the range must lie
4100  *	entirely within one pte-page.  This is NOT checked.
4101  *	Assumes that the pte-page exists.
4102  *
4103  *	Returns the number of PTE changed
4104  */
4105 MARK_AS_PMAP_TEXT static int
4106 pmap_remove_range(
4107 	pmap_t pmap,
4108 	vm_map_address_t va,
4109 	pt_entry_t *bpte,
4110 	pt_entry_t *epte)
4111 {
4112 	bool need_strong_sync = false;
4113 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4114 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4115 	if (num_changed > 0) {
4116 		PMAP_UPDATE_TLBS(pmap, va,
4117 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4118 	}
4119 	return num_changed;
4120 }
4121 
4122 
#ifdef PVH_FLAG_EXEC

/*
 *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guranteeing that a verified executable page
 *	has no writable mappings anywhere in the system, including the physical
 *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
 *	synchronization overhead in cases where the call to this function is
 *	guaranteed to be followed by other TLB operations.
 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	/* Caller must hold the PVH lock for this physical page index. */
	pvh_assert_locked(pai);
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the AP bits already match the requested protection. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	/* Publish the new PTE, then invalidate the stale translation. */
	write_pte_strong(pte_p, tmplate);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
#endif /* defined(PVH_FLAG_EXEC) */
4158 
4159 
4160 
4161 MARK_AS_PMAP_TEXT int
4162 pmap_remove_range_options(
4163 	pmap_t pmap,
4164 	vm_map_address_t va,
4165 	pt_entry_t *bpte,
4166 	pt_entry_t *epte,
4167 	vm_map_address_t *eva,
4168 	bool *need_strong_sync __unused,
4169 	int options)
4170 {
4171 	pt_entry_t     *cpte;
4172 	size_t          npages = 0;
4173 	int             num_removed, num_unwired;
4174 	int             num_pte_changed;
4175 	unsigned int    pai = 0;
4176 	pmap_paddr_t    pa;
4177 	int             num_external, num_internal, num_reusable;
4178 	int             num_alt_internal;
4179 	uint64_t        num_compressed, num_alt_compressed;
4180 	int16_t         refcnt = 0;
4181 
4182 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4183 
4184 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4185 	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4186 
4187 	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4188 		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4189 	}
4190 
4191 	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4192 		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4193 	}
4194 
4195 	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
4196 		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
4197 	}
4198 
4199 	num_removed = 0;
4200 	num_unwired = 0;
4201 	num_pte_changed = 0;
4202 	num_external = 0;
4203 	num_internal = 0;
4204 	num_reusable = 0;
4205 	num_compressed = 0;
4206 	num_alt_internal = 0;
4207 	num_alt_compressed = 0;
4208 
4209 #if XNU_MONITOR
4210 	bool ro_va = false;
4211 	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4212 		ro_va = true;
4213 	}
4214 #endif
4215 	for (cpte = bpte; cpte < epte;
4216 	    cpte += PAGE_RATIO, va += pmap_page_size) {
4217 		pt_entry_t      spte;
4218 		boolean_t       managed = FALSE;
4219 
4220 		/*
4221 		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4222 		 * so we need to be as aggressive as possible in checking for preemption when we can.
4223 		 */
4224 		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4225 			*eva = va;
4226 			break;
4227 		}
4228 
4229 		spte = *((volatile pt_entry_t*)cpte);
4230 
4231 		while (!managed) {
4232 			if (pmap != kernel_pmap &&
4233 			    (options & PMAP_OPTIONS_REMOVE) &&
4234 			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4235 				/*
4236 				 * "pmap" must be locked at this point,
4237 				 * so this should not race with another
4238 				 * pmap_remove_range() or pmap_enter().
4239 				 */
4240 
4241 				/* one less "compressed"... */
4242 				num_compressed++;
4243 				if (spte & ARM_PTE_COMPRESSED_ALT) {
4244 					/* ... but it used to be "ALTACCT" */
4245 					num_alt_compressed++;
4246 				}
4247 
4248 				/* clear marker */
4249 				write_pte_fast(cpte, ARM_PTE_EMPTY);
4250 				/*
4251 				 * "refcnt" also accounts for
4252 				 * our "compressed" markers,
4253 				 * so let's update it here.
4254 				 */
4255 				--refcnt;
4256 				spte = *((volatile pt_entry_t*)cpte);
4257 			}
4258 			/*
4259 			 * It may be possible for the pte to transition from managed
4260 			 * to unmanaged in this timeframe; for now, elide the assert.
4261 			 * We should break out as a consequence of checking pa_valid.
4262 			 */
4263 			//assert(!ARM_PTE_IS_COMPRESSED(spte));
4264 			pa = pte_to_pa(spte);
4265 			if (!pa_valid(pa)) {
4266 #if XNU_MONITOR
4267 				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4268 #endif
4269 #if XNU_MONITOR
4270 				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4271 				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4272 					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4273 					    __func__, (uint64_t)pa);
4274 				}
4275 #endif
4276 				break;
4277 			}
4278 #if HAS_FEAT_XS
4279 			if (pte_is_xs(pt_attr, spte)) {
4280 				*need_strong_sync = true;
4281 			}
4282 #endif /* HAS_FEAT_XS */
4283 			pai = pa_index(pa);
4284 			pvh_lock(pai);
4285 			spte = *((volatile pt_entry_t*)cpte);
4286 			pa = pte_to_pa(spte);
4287 			if (pai == pa_index(pa)) {
4288 				managed = TRUE;
4289 				break; // Leave pai locked as we will unlock it after we free the PV entry
4290 			}
4291 			pvh_unlock(pai);
4292 		}
4293 
4294 		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4295 			/*
4296 			 * There used to be a valid mapping here but it
4297 			 * has already been removed when the page was
4298 			 * sent to the VM compressor, so nothing left to
4299 			 * remove now...
4300 			 */
4301 			continue;
4302 		}
4303 
4304 		/* remove the translation, do not flush the TLB */
4305 		if (*cpte != ARM_PTE_EMPTY) {
4306 			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4307 			assertf(pte_is_valid(*cpte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4308 #if MACH_ASSERT
4309 			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4310 				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4311 				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4312 			}
4313 #endif
4314 			write_pte_fast(cpte, ARM_PTE_EMPTY);
4315 			num_pte_changed++;
4316 		}
4317 
4318 		if ((spte != ARM_PTE_EMPTY) && (pmap != kernel_pmap)) {
4319 			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4320 			assertf(pte_is_valid(spte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4321 			--refcnt;
4322 		}
4323 
4324 		if (pte_is_wired(spte)) {
4325 			pte_set_wired(pmap, cpte, 0);
4326 			num_unwired++;
4327 		}
4328 		/*
4329 		 * if not managed, we're done
4330 		 */
4331 		if (!managed) {
4332 			continue;
4333 		}
4334 
4335 #if XNU_MONITOR
4336 		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
4337 			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
4338 		}
4339 		if (__improbable(ro_va)) {
4340 			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4341 		}
4342 #endif
4343 
4344 		/*
4345 		 * find and remove the mapping from the chain for this
4346 		 * physical address.
4347 		 */
4348 		bool is_internal, is_altacct;
4349 		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4350 
4351 		if (is_altacct) {
4352 			assert(is_internal);
4353 			num_internal++;
4354 			num_alt_internal++;
4355 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4356 				ppattr_clear_altacct(pai);
4357 				ppattr_clear_internal(pai);
4358 			}
4359 		} else if (is_internal) {
4360 			if (ppattr_test_reusable(pai)) {
4361 				num_reusable++;
4362 			} else {
4363 				num_internal++;
4364 			}
4365 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4366 				ppattr_clear_internal(pai);
4367 			}
4368 		} else {
4369 			num_external++;
4370 		}
4371 		pvh_unlock(pai);
4372 		num_removed++;
4373 	}
4374 
4375 	/*
4376 	 *	Update the counts
4377 	 */
4378 	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4379 
4380 	if (pmap != kernel_pmap) {
4381 		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4382 			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4383 		}
4384 
4385 		/* update ledgers */
4386 		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4387 		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4388 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4389 		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4390 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4391 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4392 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4393 		/* make needed adjustments to phys_footprint */
4394 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4395 		    ((num_internal -
4396 		    num_alt_internal) +
4397 		    (num_compressed -
4398 		    num_alt_compressed)) * pmap_page_size);
4399 	}
4400 
4401 	/* flush the ptable entries we have written */
4402 	if (num_pte_changed > 0) {
4403 		FLUSH_PTE_STRONG();
4404 	}
4405 
4406 	return num_pte_changed;
4407 }
4408 
4409 
4410 /*
4411  *	Remove the given range of addresses
4412  *	from the specified map.
4413  *
4414  *	It is assumed that the start and end are properly
4415  *	rounded to the hardware page size.
4416  */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* A plain remove is simply pmap_remove_options() with the default removal flag. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4425 
/*
 * Internal (per-chunk) implementation of pmap_remove_options().
 *
 * Removes the mappings in [start, end) from "pmap".  The caller must pass a
 * leaf-page-aligned range that does not cross a twig-level table boundary,
 * so that a single TTE lookup covers the whole request; pmap_remove_options()
 * batches its calls accordingly.
 *
 * Returns the VA at which removal actually stopped (pmap_remove_range_options()
 * may stop short of "end"); the caller resumes from this address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject inverted or non-leaf-page-aligned ranges outright. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig-level entry: nothing is mapped in this range. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table no longer holds any mappings, reclaim it (user pmaps only). */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4492 
/*
 * Remove the mappings in [start, end) from "pmap", processing the range in
 * chunks that never cross a twig-level table boundary (a requirement of the
 * internal helper).  Dispatches each chunk to the PPL when XNU_MONITOR is
 * configured.
 */
__mockable void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		/* Smaller, 128-leaf-page batches — presumably to bound per-chunk latency; see the radar above. */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance at most to the next twig-table boundary per iteration. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL configuration: the removal itself runs inside the monitor. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4559 
4560 
4561 /*
4562  *	Remove phys addr if mapped in specified map
4563  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Intentionally a no-op on ARM; implement to support working set code. */
}
4571 
4572 /*
4573  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4574  * switch a thread onto a new vm_map.
4575  */
4576 void
4577 pmap_switch_user(thread_t thread, vm_map_t new_map)
4578 {
4579 	pmap_t new_pmap = new_map->pmap;
4580 
4581 
4582 	thread->map = new_map;
4583 	pmap_set_pmap(new_pmap, thread);
4584 
4585 }
4586 
/*
 * Make "pmap" the active user address space for "thread" on the current CPU.
 */
void
pmap_set_pmap(
	pmap_t pmap,
	thread_t        thread)
{
	pmap_switch(pmap, thread);
#if __ARM_USER_PROTECT__
	/*
	 * Cache the user TTBR value and ASID in the thread's machine state —
	 * presumably consumed by the context-switch path; verify against callers.
	 */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4598 
4599 static void
4600 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4601 {
4602 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4603 }
4604 
4605 #if HAS_SPECRES
4606 static void
4607 pmap_flush_core_cfp_asid_async(pmap_t pmap)
4608 {
4609 	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
4610 	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
4611 }
4612 
4613 #if REQUIRES_DVP_RCTX
/*
 * Issue a DVP RCTX instruction targeting EL0 state tagged with this pmap's
 * hardware ASID (only built on cores where REQUIRES_DVP_RCTX is set).
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
4620 #endif /* REQUIRES_DVP_RCTX */
4621 #endif /* HAS_SPECRES */
4622 
4623 static inline bool
4624 pmap_user_ttb_is_clear(void)
4625 {
4626 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4627 }
4628 
/*
 * Switch the current CPU's user address space to "pmap".
 *
 * Works out which TLB / prediction-state maintenance the transition requires
 * (ASID reuse, shared-region change, commpage page-size change, SPECRES
 * restriction), performs it — clearing the user TTB first when a
 * break-before-make transition is needed — and finally programs the new
 * user TTB via pmap_switch_user_ttb().
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* Only the kernel pmap may legitimately carry ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	/* ASIDs are consumed in pairs in this configuration; halve to get the logical index. */
	asid_index >>= 1;
#endif

	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	/* Switching to a pmap nesting a different shared region requires flushing the old region's global TLB entries. */
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this phyiscal ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides: force RCTX maintenance on or off unconditionally. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* A TCR change (different page-size geometry) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform an CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range-TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	if (__improbable(do_commpage_flush)) {
		/* Flush the global commpage nesting region mapped at the outgoing page size. */
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize any of the async maintenance operations issued above before switching TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4806 
/*
 * Activate "pmap" on the current CPU, routing through the PPL trampoline
 * when the pmap monitor is configured.
 */
void
pmap_switch(
	pmap_t pmap,
	thread_t thread __unused)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4820 
/*
 * Lower the permissions of all mappings of physical page "ppnum" to "prot";
 * equivalent to pmap_page_protect_options() with no options.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4828 
4829 /*
4830  *	Routine:	pmap_page_protect_options
4831  *
4832  *	Function:
4833  *		Lower the permission for all mappings to a given
4834  *		page.
4835  */
4836 MARK_AS_PMAP_TEXT static void
4837 pmap_page_protect_options_with_flush_range(
4838 	ppnum_t ppnum,
4839 	vm_prot_t prot,
4840 	unsigned int options,
4841 	pmap_tlb_flush_range_t *flush_range)
4842 {
4843 	pmap_paddr_t    phys = ptoa(ppnum);
4844 	pv_entry_t    **pv_h;
4845 	pv_entry_t     *pve_p, *orig_pve_p;
4846 	pv_entry_t     *pveh_p;
4847 	pv_entry_t     *pvet_p;
4848 	pt_entry_t     *pte_p, *orig_pte_p;
4849 	pv_entry_t     *new_pve_p;
4850 	pt_entry_t     *new_pte_p;
4851 	vm_offset_t     pvh_flags;
4852 	unsigned int    pai;
4853 	bool            remove;
4854 	bool            set_NX;
4855 	unsigned int    pvh_cnt = 0;
4856 	unsigned int    pass1_updated = 0;
4857 	unsigned int    pass2_updated = 0;
4858 
4859 	assert(ppnum != vm_page_fictitious_addr);
4860 
4861 	/* Only work with managed pages. */
4862 	if (!pa_valid(phys)) {
4863 		return;
4864 	}
4865 
4866 	/*
4867 	 * Determine the new protection.
4868 	 */
4869 	switch (prot) {
4870 	case VM_PROT_ALL:
4871 		return;         /* nothing to do */
4872 	case VM_PROT_READ:
4873 	case VM_PROT_READ | VM_PROT_EXECUTE:
4874 		remove = false;
4875 		break;
4876 	default:
4877 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4878 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4879 		remove = true;
4880 		break;
4881 	}
4882 
4883 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4884 	if (remove) {
4885 #if !XNU_MONITOR
4886 		mp_disable_preemption();
4887 #endif
4888 		pmap_cpu_data = pmap_get_cpu_data();
4889 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4890 		/*
4891 		 * Ensure the store to inflight_disconnect will be observed before any of the
4892 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4893 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4894 		 * another CPU, in between this function's clearing a PTE and dropping the
4895 		 * corresponding pagetable refcount.  That can lead to a panic if the
4896 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4897 		 * store barrier; a store-release operation would not be sufficient.
4898 		 */
4899 		os_atomic_thread_fence(release);
4900 	}
4901 
4902 	pai = pa_index(phys);
4903 	pvh_lock(pai);
4904 	pv_h = pai_to_pvh(pai);
4905 	pvh_flags = pvh_get_flags(pv_h);
4906 
4907 #if XNU_MONITOR
4908 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4909 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4910 	}
4911 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4912 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4913 	}
4914 	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4915 		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4916 	}
4917 #endif
4918 
4919 
4920 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4921 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4922 	pveh_p = PV_ENTRY_NULL;
4923 	pvet_p = PV_ENTRY_NULL;
4924 	new_pve_p = PV_ENTRY_NULL;
4925 	new_pte_p = PT_ENTRY_NULL;
4926 
4927 
4928 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4929 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4930 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4931 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4932 		pveh_p = pve_p;
4933 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4934 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4935 	}
4936 
4937 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4938 	int pve_ptep_idx = 0;
4939 
4940 	/*
4941 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4942 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4943 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4944 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4945 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4946 	 * tlb_flush_needed to be true while issue_tlbi is false.
4947 	 */
4948 	bool issue_tlbi = false;
4949 	bool tlb_flush_needed = false;
4950 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4951 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4952 		pt_entry_t tmplate = ARM_PTE_EMPTY;
4953 		bool update = false;
4954 
4955 		if (pve_p != PV_ENTRY_NULL) {
4956 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4957 			if (pte_p == PT_ENTRY_NULL) {
4958 				goto protect_skip_pve_pass1;
4959 			}
4960 		}
4961 
4962 #ifdef PVH_FLAG_IOMMU
4963 		if (pvh_ptep_is_iommu(pte_p)) {
4964 #if XNU_MONITOR
4965 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4966 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4967 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4968 			}
4969 #endif
4970 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4971 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4972 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4973 			}
4974 			goto protect_skip_pve_pass1;
4975 		}
4976 #endif
4977 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4978 		const pmap_t pmap = ptdp->pmap;
4979 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4980 
4981 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4982 #if MACH_ASSERT
4983 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4984 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4985 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4986 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4987 
4988 				pv_entry_t *check_pvep = pve_p;
4989 
4990 				do {
4991 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4992 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4993 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4994 					}
4995 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4996 
4997 				/* Restore previous PTEP value. */
4998 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4999 			}
5000 #endif
5001 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
5002 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5003 		}
5004 
5005 #if DEVELOPMENT || DEBUG
5006 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5007 #else
5008 		if ((prot & VM_PROT_EXECUTE))
5009 #endif
5010 		{
5011 			set_NX = false;
5012 		} else {
5013 			set_NX = true;
5014 		}
5015 
5016 #if HAS_FEAT_XS
5017 		/**
5018 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
5019 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
5020 		 */
5021 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
5022 #endif /* HAS_FEAT_XS */
5023 
5024 		/* Remove the mapping if new protection is NONE */
5025 		if (remove) {
5026 			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
5027 				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
5028 				    __func__, pmap, ppnum);
5029 			}
5030 
5031 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5032 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5033 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5034 			pt_entry_t spte = *pte_p;
5035 
5036 			if (pte_is_wired(spte)) {
5037 				pte_set_wired(pmap, pte_p, 0);
5038 				spte = *pte_p;
5039 				if (pmap != kernel_pmap) {
5040 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5041 				}
5042 			}
5043 
5044 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
5045 			    (uint64_t)spte, pte_p, ppnum);
5046 
5047 			if (compress && is_internal && (pmap != kernel_pmap)) {
5048 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
5049 				/* mark this PTE as having been "compressed" */
5050 				tmplate = ARM_PTE_COMPRESSED;
5051 				if (is_altacct) {
5052 					tmplate |= ARM_PTE_COMPRESSED_ALT;
5053 				}
5054 			} else {
5055 				tmplate = ARM_PTE_EMPTY;
5056 			}
5057 
5058 			assert(spte != tmplate);
5059 			write_pte_fast(pte_p, tmplate);
5060 			update = true;
5061 			++pass1_updated;
5062 
5063 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5064 
5065 			if (pmap != kernel_pmap) {
5066 				if (ppattr_test_reusable(pai) &&
5067 				    is_internal &&
5068 				    !is_altacct) {
5069 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5070 				} else if (!is_internal) {
5071 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5072 				}
5073 
5074 				if (is_altacct) {
5075 					assert(is_internal);
5076 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5077 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5078 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5079 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5080 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5081 					}
5082 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5083 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5084 				} else if (ppattr_test_reusable(pai)) {
5085 					assert(is_internal);
5086 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5087 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5088 						/* was not in footprint, but is now */
5089 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5090 					}
5091 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5092 				} else if (is_internal) {
5093 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5094 
5095 					/*
5096 					 * Update all stats related to physical footprint, which only
5097 					 * deals with internal pages.
5098 					 */
5099 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5100 						/*
5101 						 * This removal is only being done so we can send this page to
5102 						 * the compressor; therefore it mustn't affect total task footprint.
5103 						 */
5104 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5105 					} else {
5106 						/*
5107 						 * This internal page isn't going to the compressor, so adjust stats to keep
5108 						 * phys_footprint up to date.
5109 						 */
5110 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5111 					}
5112 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5113 				} else {
5114 					/* external page: no impact on ledgers */
5115 				}
5116 			}
5117 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5118 		} else {
5119 			pt_entry_t spte = *pte_p;
5120 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5121 
5122 			if (pmap == kernel_pmap) {
5123 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5124 			} else {
5125 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5126 			}
5127 
5128 			/*
5129 			 * While the naive implementation of this would serve to add execute
5130 			 * permission, this is not how the VM uses this interface, or how
5131 			 * x86_64 implements it.  So ignore requests to add execute permissions.
5132 			 */
5133 			if (set_NX) {
5134 				tmplate |= pt_attr_leaf_xn(pt_attr);
5135 			}
5136 
5137 
5138 			assert(spte != ARM_PTE_EMPTY);
5139 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5140 
5141 			if (spte != tmplate) {
5142 				/*
5143 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5144 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5145 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
5146 				 * should always be cleared by this function.
5147 				 */
5148 				pte_set_was_writeable(tmplate, true);
5149 				write_pte_fast(pte_p, tmplate);
5150 				update = true;
5151 				++pass1_updated;
5152 			} else if (pte_was_writeable(tmplate)) {
5153 				/*
5154 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5155 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
5156 				 * write access to a page, this function should always at least clear that flag for
5157 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5158 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
5159 				 * be handled through arm_fast_fault().
5160 				 */
5161 				pte_set_was_writeable(tmplate, false);
5162 				write_pte_fast(pte_p, tmplate);
5163 			}
5164 		}
5165 
5166 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5167 			tlb_flush_needed = true;
5168 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5169 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5170 				issue_tlbi = true;
5171 			}
5172 		}
5173 protect_skip_pve_pass1:
5174 		pte_p = PT_ENTRY_NULL;
5175 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5176 			pve_ptep_idx = 0;
5177 			pve_p = pve_next(pve_p);
5178 		}
5179 	}
5180 
5181 	if (tlb_flush_needed) {
5182 		FLUSH_PTE_STRONG();
5183 	}
5184 
5185 	if (!remove && !issue_tlbi) {
5186 		goto protect_finish;
5187 	}
5188 
5189 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5190 	pv_entry_t **pve_pp = pv_h;
5191 	pve_p = orig_pve_p;
5192 	pte_p = orig_pte_p;
5193 	pve_ptep_idx = 0;
5194 
5195 	/*
5196 	 * We need to keep track of whether a particular PVE list contains IOMMU
5197 	 * mappings when removing entries, because we should only remove CPU
5198 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5199 	 * it around.
5200 	 */
5201 	bool iommu_mapping_in_pve = false;
5202 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5203 		if (pve_p != PV_ENTRY_NULL) {
5204 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5205 			if (pte_p == PT_ENTRY_NULL) {
5206 				goto protect_skip_pve_pass2;
5207 			}
5208 		}
5209 
5210 #ifdef PVH_FLAG_IOMMU
5211 		if (pvh_ptep_is_iommu(pte_p)) {
5212 			iommu_mapping_in_pve = true;
5213 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5214 				/*
5215 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5216 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5217 				 * contain the single IOMMU PTE and exit the loop.
5218 				 */
5219 				new_pte_p = pte_p;
5220 				break;
5221 			}
5222 			goto protect_skip_pve_pass2;
5223 		}
5224 #endif
5225 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5226 		const pmap_t pmap = ptdp->pmap;
5227 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5228 
5229 		if (remove) {
5230 			if (!compress && (pmap != kernel_pmap)) {
5231 				/*
5232 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5233 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5234 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5235 				 * under us.
5236 				 */
5237 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5238 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5239 				}
5240 			}
5241 			/* Remove this CPU mapping from PVE list. */
5242 			if (pve_p != PV_ENTRY_NULL) {
5243 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5244 			}
5245 		} else {
5246 			pt_entry_t spte = *pte_p;
5247 			if (pte_was_writeable(spte)) {
5248 				pte_set_was_writeable(spte, false);
5249 				write_pte_fast(pte_p, spte);
5250 			} else {
5251 				goto protect_skip_pve_pass2;
5252 			}
5253 		}
5254 		++pass2_updated;
5255 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5256 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5257 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5258 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5259 		}
5260 
5261 protect_skip_pve_pass2:
5262 		pte_p = PT_ENTRY_NULL;
5263 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5264 			pve_ptep_idx = 0;
5265 
5266 			if (remove) {
5267 				/**
5268 				 * If there are any IOMMU mappings in the PVE list, preserve
5269 				 * those mappings in a new PVE list (new_pve_p) which will later
5270 				 * become the new PVH entry. Keep track of the CPU mappings in
5271 				 * pveh_p/pvet_p so they can be deallocated later.
5272 				 */
5273 				if (iommu_mapping_in_pve) {
5274 					iommu_mapping_in_pve = false;
5275 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5276 					pve_remove(pv_h, pve_pp, pve_p);
5277 					pveh_p = pvh_pve_list(pv_h);
5278 					pve_p->pve_next = new_pve_p;
5279 					new_pve_p = pve_p;
5280 					pve_p = temp_pve_p;
5281 					continue;
5282 				} else {
5283 					pvet_p = pve_p;
5284 					pvh_cnt++;
5285 				}
5286 			}
5287 
5288 			pve_pp = pve_next_ptr(pve_p);
5289 			pve_p = pve_next(pve_p);
5290 			iommu_mapping_in_pve = false;
5291 		}
5292 	}
5293 
5294 protect_finish:
5295 
5296 #ifdef PVH_FLAG_EXEC
5297 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5298 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5299 	}
5300 #endif
5301 	if (__improbable(pass1_updated != pass2_updated)) {
5302 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5303 		    __func__, pass1_updated, pass2_updated);
5304 	}
5305 	/* if we removed a bunch of entries, take care of them now */
5306 	if (remove) {
5307 		if (new_pve_p != PV_ENTRY_NULL) {
5308 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5309 			pvh_set_flags(pv_h, pvh_flags);
5310 		} else if (new_pte_p != PT_ENTRY_NULL) {
5311 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5312 			pvh_set_flags(pv_h, pvh_flags);
5313 		} else {
5314 			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5315 				pmap_flush_noncoherent_page(phys);
5316 			}
5317 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5318 		}
5319 	}
5320 
5321 	if (flush_range && tlb_flush_needed) {
5322 		if (!remove) {
5323 			flush_range->ptfr_flush_needed = true;
5324 			tlb_flush_needed = false;
5325 		}
5326 	}
5327 
5328 	/*
5329 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5330 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5331 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5332 	 * a page to be repurposed while it is still live in the TLBs.
5333 	 */
5334 	if (remove && tlb_flush_needed) {
5335 		sync_tlb_flush();
5336 	}
5337 
5338 
5339 	pvh_unlock(pai);
5340 
5341 	if (remove) {
5342 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5343 #if !XNU_MONITOR
5344 		mp_enable_preemption();
5345 #endif
5346 	}
5347 
5348 	if (!remove && tlb_flush_needed) {
5349 		sync_tlb_flush();
5350 	}
5351 
5352 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5353 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5354 	}
5355 }
5356 
5357 MARK_AS_PMAP_TEXT void
5358 pmap_page_protect_options_internal(
5359 	ppnum_t ppnum,
5360 	vm_prot_t prot,
5361 	unsigned int options,
5362 	void *arg)
5363 {
5364 	if (arg != NULL) {
5365 		/*
5366 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5367 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5368 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5369 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5370 		 * In that case, force the flush to take place.
5371 		 */
5372 		options &= ~PMAP_OPTIONS_NOFLUSH;
5373 	}
5374 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5375 }
5376 
5377 void
5378 pmap_page_protect_options(
5379 	ppnum_t ppnum,
5380 	vm_prot_t prot,
5381 	unsigned int options,
5382 	void *arg)
5383 {
5384 	pmap_paddr_t    phys = ptoa(ppnum);
5385 
5386 	assert(ppnum != vm_page_fictitious_addr);
5387 
5388 	/* Only work with managed pages. */
5389 	if (!pa_valid(phys)) {
5390 		return;
5391 	}
5392 
5393 	/*
5394 	 * Determine the new protection.
5395 	 */
5396 	if (prot == VM_PROT_ALL) {
5397 		return;         /* nothing to do */
5398 	}
5399 
5400 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5401 
5402 #if XNU_MONITOR
5403 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5404 #else
5405 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5406 #endif
5407 
5408 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5409 }
5410 
5411 
5412 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Permanently disable JOP (pointer authentication of user code) for [pmap].
 * Only valid for user pmaps; the kernel pmap must keep JOP enabled.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate before touching pmap state, as this may run inside the PPL. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5422 
/*
 * Public entry point for disabling user JOP on [pmap]; dispatches to the PPL
 * on monitor configurations, or calls the internal helper directly.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5432 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5433 
5434 /*
5435  * Indicates if the pmap layer enforces some additional restrictions on the
5436  * given set of protections.
5437  */
5438 bool
5439 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5440 {
5441 	return false;
5442 }
5443 
/*
 * Returns whether [pmap] is permitted to contain execute-only (XO) mappings.
 * Always true in this configuration.
 */
static inline bool
pmap_allows_xo(pmap_t pmap __unused)
{
	return true;
}
5449 
5450 /*
5451  *	Set the physical protection on the
5452  *	specified range of this map as requested.
5453  *	VERY IMPORTANT: Will not increase permissions.
5454  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5455  */
5456 void
5457 pmap_protect(
5458 	pmap_t pmap,
5459 	vm_map_address_t b,
5460 	vm_map_address_t e,
5461 	vm_prot_t prot)
5462 {
5463 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5464 }
5465 
/*
 * Lower the protections on a range of existing mappings in [pmap].  The range
 * must not span a twig-level (leaf pagetable) boundary; callers iterate twig
 * regions via pmap_protect_options().
 *
 * @param pmap the pmap whose mappings are to be updated.
 * @param start beginning of the VA range.
 * @param end end of the VA range; panics if [start, end) crosses a twig
 *        boundary or end < start.
 * @param prot new protections.  Requests that should instead remove the
 *        mappings (e.g. VM_PROT_NONE outside PMAP_OPTIONS_PROTECT_IMMEDIATE,
 *        or XO on a pmap that disallows it) cause a panic.
 * @param options PMAP_OPTIONS_* flags.
 *
 * @return the VA at which processing stopped.  May be earlier than [end] if
 *         the loop broke out early due to pending preemption; the caller is
 *         expected to resume from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must lie entirely within one twig region. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		case VM_PROT_EXECUTE:
			set_XO = true;
			if (pmap_allows_xo(pmap)) {
				break;
			}
		/* Fall through and panic if this pmap shouldn't be allowed to have XO mappings. */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Removal requests must go through pmap_remove(); see pmap_protect_options(). */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && tte_is_valid_table(*tte_p)) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/* Periodically break out to allow preemption; caller resumes from returned va. */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock and then re-read the PTE,
			 * retrying until the observed PTE and the locked PVH index agree
			 * (the PTE may be concurrently modified before we hold the lock).
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have been emptied/compressed while we chased the lock. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (__improbable(set_XO)) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Make all PTE updates visible before issuing the TLB invalidates. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No valid leaf table for this range; nothing to update. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5714 
/*
 * Lower the protections on the VA range [b, e) of [pmap], applying the work
 * one twig-level (leaf table) region at a time.  Requests that amount to a
 * removal (VM_PROT_NONE under PMAP_OPTIONS_PROTECT_IMMEDIATE on DEVELOPMENT/
 * DEBUG, or XO on a pmap that disallows it) are redirected to
 * pmap_remove_options().  Will never increase permissions.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be page-aligned for this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		case VM_PROT_EXECUTE:
			if (pmap_allows_xo(pmap)) {
				break;
			}
		/* Fall through and remove the mapping if XO is requested and [pmap] doesn't allow it. */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Process at most one twig region per call into the internal routine;
	 * the internal routine may return early on pending preemption, in
	 * which case we simply resume from the address it returned.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5793 
5794 /**
5795  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5796  *
5797  * @param pmap pmap to insert the pages into.
5798  * @param va virtual address to map the pages into.
5799  * @param pa page number of the first physical page to map.
5800  * @param size block size, in number of pages.
5801  * @param prot mapping protection attributes.
5802  * @param attr flags to pass to pmap_enter().
5803  *
5804  * @return KERN_SUCCESS.
5805  */
5806 kern_return_t
5807 pmap_map_block(
5808 	pmap_t pmap,
5809 	addr64_t va,
5810 	ppnum_t pa,
5811 	uint32_t size,
5812 	vm_prot_t prot,
5813 	int attr,
5814 	unsigned int flags)
5815 {
5816 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5817 }
5818 
5819 /**
5820  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5821  * As opposed to pmap_map_block(), this function takes
5822  * a physical address as an input and operates using the
5823  * page size associated with the input pmap.
5824  *
5825  * @param pmap pmap to insert the pages into.
5826  * @param va virtual address to map the pages into.
5827  * @param pa physical address of the first physical page to map.
5828  * @param size block size, in number of pages.
5829  * @param prot mapping protection attributes.
5830  * @param attr flags to pass to pmap_enter().
5831  *
5832  * @return KERN_SUCCESS.
5833  */
5834 kern_return_t
5835 pmap_map_block_addr(
5836 	pmap_t pmap,
5837 	addr64_t va,
5838 	pmap_paddr_t pa,
5839 	uint32_t size,
5840 	vm_prot_t prot,
5841 	int attr,
5842 	unsigned int flags)
5843 {
5844 #if __ARM_MIXED_PAGE_SIZE__
5845 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5846 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5847 #else
5848 	const uint64_t pmap_page_size = PAGE_SIZE;
5849 #endif
5850 
5851 	for (ppnum_t page = 0; page < size; page++) {
5852 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5853 			panic("%s: failed pmap_enter_addr, "
5854 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5855 			    __FUNCTION__,
5856 			    pmap, va, (uint64_t)pa, size, prot, flags);
5857 		}
5858 
5859 		va += pmap_page_size;
5860 		pa += pmap_page_size;
5861 	}
5862 
5863 	return KERN_SUCCESS;
5864 }
5865 
/*
 * Enter a single mapping of physical address [pa] at VA [v] in [pmap].
 * Thin wrapper around pmap_enter_options_addr() with no options and an
 * inferred mapping type.
 */
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5878 
5879 /*
5880  *	Insert the given physical page (p) at
5881  *	the specified virtual address (v) in the
5882  *	target physical map with the protection requested.
5883  *
5884  *	If specified, the page will be wired down, meaning
5885  *	that the related pte can not be reclaimed.
5886  *
5887  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5888  *	or lose information.  That is, this routine must actually
5889  *	insert this page into the given map eventually (must make
5890  *	forward progress eventually.
5891  */
5892 kern_return_t
5893 pmap_enter(
5894 	pmap_t pmap,
5895 	vm_map_address_t v,
5896 	ppnum_t pn,
5897 	vm_prot_t prot,
5898 	vm_prot_t fault_type,
5899 	unsigned int flags,
5900 	boolean_t wired,
5901 	__unused pmap_mapping_type_t mapping_type)
5902 {
5903 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5904 }
5905 
5906 /*
5907  * Attempt to commit the pte.
5908  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5909  * Performs no page table or accounting writes on failures.
5910  */
5911 static inline bool
5912 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5913 {
5914 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5915 	bool success = false, changed_wiring = false;
5916 
5917 	__unreachable_ok_push
5918 	if (TEST_PAGE_RATIO_4) {
5919 		/*
5920 		 * 16K virtual pages w/ 4K hw pages.
5921 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5922 		 * As a result we require the exclusive pmap lock.
5923 		 */
5924 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5925 		*old_pte = *pte_p;
5926 		if (*old_pte == new_pte) {
5927 			/* Another thread completed this operation. Nothing to do here. */
5928 			success = true;
5929 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5930 		    pte_is_valid(*old_pte)) {
5931 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5932 			success = false;
5933 		} else {
5934 			write_pte_fast(pte_p, new_pte);
5935 			success = true;
5936 		}
5937 	} else {
5938 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5939 	}
5940 	__unreachable_ok_pop
5941 
5942 	if (success && *old_pte != new_pte) {
5943 		if (pte_is_valid(*old_pte)) {
5944 			bool need_strong_sync = false;
5945 			FLUSH_PTE_STRONG();
5946 #if HAS_FEAT_XS
5947 			if (pte_is_xs(pt_attr, *old_pte)) {
5948 				need_strong_sync = true;
5949 			}
5950 #endif /* HAS_FEAT_XS */
5951 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5952 		} else {
5953 			FLUSH_PTE();
5954 			__builtin_arm_isb(ISB_SY);
5955 		}
5956 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5957 		    (new_pte & ARM_PTE_WIRED) != 0 :
5958 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5959 
5960 		if (pmap != kernel_pmap && changed_wiring) {
5961 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5962 			if (new_pte & ARM_PTE_WIRED) {
5963 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5964 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5965 			} else {
5966 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5967 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5968 			}
5969 		}
5970 
5971 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5972 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5973 	}
5974 	return success;
5975 }
5976 
/*
 * Translate VM_WIMG_* cacheability/ordering attributes into the PTE memory
 * attribute index, shareability, and execute-never bits appropriate for the
 * physical address [pa].  Device-type mappings additionally get NX|PNX set.
 * Several cases select a different attribute index depending on whether [pa]
 * is DRAM or device/IO space.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			// IO ranges flagged as strongly-synchronizing use the XS variant.
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		} else {
			// Real-time attribute only applies to DRAM; fall back to
			// posted-combined-reordered for device space.
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
			// Write-combining only applies to DRAM; use posted-combined-
			// reordered for device space.
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Unrecognized WIMG: fall back to the default (cacheable) attribute.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
6076 
6077 
6078 /*
6079  * Construct a PTE (and the physical page attributes) for the given virtual to
6080  * physical mapping.
6081  *
6082  * This function has no side effects and is safe to call so that it is safe to
6083  * call while attempting a pmap_enter transaction.
6084  */
6085 MARK_AS_PMAP_TEXT static pt_entry_t
6086 pmap_construct_pte(
6087 	const pmap_t pmap,
6088 	vm_map_address_t va,
6089 	pmap_paddr_t pa,
6090 	vm_prot_t prot,
6091 	vm_prot_t fault_type,
6092 	boolean_t wired,
6093 	const pt_attr_t* const pt_attr,
6094 	unsigned int options __unused,
6095 	uint16_t *pp_attr_bits /* OUTPUT */
6096 	)
6097 {
6098 	bool set_NX = false, set_XO = false, set_TPRO = false;
6099 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
6100 	assert(pp_attr_bits != NULL);
6101 	*pp_attr_bits = 0;
6102 
6103 	if (wired) {
6104 		pte |= ARM_PTE_WIRED;
6105 	}
6106 
6107 #if DEVELOPMENT || DEBUG
6108 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
6109 #else
6110 	if ((prot & VM_PROT_EXECUTE))
6111 #endif
6112 	{
6113 		set_NX = false;
6114 	} else {
6115 		set_NX = true;
6116 	}
6117 
6118 	if (prot == VM_PROT_EXECUTE) {
6119 		set_XO = true;
6120 		if (!pmap_allows_xo(pmap)) {
6121 			panic("%s: attempted execute-only mapping", __func__);
6122 		}
6123 	}
6124 
6125 	if (set_NX) {
6126 		pte |= pt_attr_leaf_xn(pt_attr);
6127 	} else {
6128 		if (pmap == kernel_pmap) {
6129 			pte |= ARM_PTE_NX;
6130 		} else {
6131 			pte |= pt_attr_leaf_x(pt_attr);
6132 		}
6133 	}
6134 
6135 	if (pmap == kernel_pmap) {
6136 #if __ARM_KERNEL_PROTECT__
6137 		pte |= ARM_PTE_NG;
6138 #endif /* __ARM_KERNEL_PROTECT__ */
6139 		if (prot & VM_PROT_WRITE) {
6140 			pte |= ARM_PTE_AP(AP_RWNA);
6141 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6142 		} else {
6143 			pte |= ARM_PTE_AP(AP_RONA);
6144 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6145 		}
6146 	} else {
6147 		if (pmap->type != PMAP_TYPE_NESTED) {
6148 			pte |= ARM_PTE_NG;
6149 		} else if ((pmap->nested_region_unnested_table_bitmap)
6150 		    && (va >= pmap->nested_region_addr)
6151 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
6152 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
6153 
6154 			if ((pmap->nested_region_unnested_table_bitmap)
6155 			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
6156 				pte |= ARM_PTE_NG;
6157 			}
6158 		}
6159 		if (set_TPRO) {
6160 			pte |= pt_attr_leaf_rona(pt_attr);
6161 			*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6162 		} else if (prot & VM_PROT_WRITE) {
6163 			assert(pmap->type != PMAP_TYPE_NESTED);
6164 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6165 				if (fault_type & VM_PROT_WRITE) {
6166 					pte |= pt_attr_leaf_rw(pt_attr);
6167 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6168 				} else {
6169 					pte |= pt_attr_leaf_ro(pt_attr);
6170 					/*
6171 					 * Mark the page as MODFAULT so that a subsequent write
6172 					 * may be handled through arm_fast_fault().
6173 					 */
6174 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6175 					pte_set_was_writeable(pte, true);
6176 				}
6177 			} else {
6178 				pte |= pt_attr_leaf_rw(pt_attr);
6179 				*pp_attr_bits |= PP_ATTR_REFERENCED;
6180 			}
6181 		} else {
6182 			if (__improbable(set_XO)) {
6183 				pte |= pt_attr_leaf_rona(pt_attr);
6184 			} else {
6185 				pte |= pt_attr_leaf_ro(pt_attr);
6186 			}
6187 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6188 		}
6189 	}
6190 
6191 	pte |= ARM_PTE_AF;
6192 	return pte;
6193 }
6194 
/**
 * Create (or update) a mapping from virtual address 'v' to physical address 'pa'
 * in the given pmap, encoding the requested protections, wiring, and
 * cacheability into the new PTE and committing it via an atomic
 * compare-and-swap transaction.
 *
 * @param pmap       The address space in which to enter the mapping.
 * @param v          Page-aligned virtual address to map (panics if misaligned
 *                   or outside [pmap->min, pmap->max)).
 * @param pa         Page-aligned physical address to map to.
 * @param prot       Protections for the new mapping.
 * @param fault_type The access type that triggered this enter; used by
 *                   pmap_construct_pte() to decide whether a writable mapping
 *                   should initially be entered read-only for modify-fault
 *                   tracking.
 * @param flags      VM_WIMG_* cacheability flags; if none supplied, the page's
 *                   current cache attributes are used.
 * @param wired      Whether the mapping should be wired.
 * @param options    PMAP_OPTIONS_* bits (NOWAIT, NOENTER, INTERNAL, ALT_ACCT...).
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED if a preempt-aware lock
 *         attempt was abandoned; KERN_RESOURCE_SHORTAGE if a PV entry could
 *         not be allocated; KERN_PROTECTION_FAILURE for an execute-only
 *         request on a pmap that doesn't allow XO; KERN_FAILURE for an
 *         executable mapping of a non-kernel-managed page.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t         pn = (ppnum_t)atop(pa);
	pt_entry_t      pte;
	pt_entry_t      spte;
	pt_entry_t      *pte_p;
	bool            refcnt_updated;
	bool            wiredcnt_updated;
	bool            ro_va = false;
	unsigned int    wimg_bits;
	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t   kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	/*
	 * PPL callers must always pass NOWAIT: the PPL cannot block for memory, so
	 * shortage must be reported back and retried outside (see
	 * pmap_enter_options_addr(), which ORs in PMAP_OPTIONS_NOWAIT).
	 */
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both the VA and PA must be aligned to the pmap's leaf page size. */
	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
		panic("%s: pmap %p v 0x%llx not page-aligned",
		    __func__, pmap, (unsigned long long)v);
	}

	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
		/* The CTRR test page is the one sanctioned exception under XNUPOST. */
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	/* Execute-only mappings are only permitted where the pmap supports XO permissions. */
	if (__improbable((prot == VM_PROT_EXECUTE) && !pmap_allows_xo(pmap))) {
		return KERN_PROTECTION_FAILURE;
	}

	/*
	 * Read-only zone VAs may only be mapped read-only; remember this so the
	 * physical aperture mapping can be write-protected after commit.
	 */
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	/* Preempt-aware lock attempt: bail out rather than block if it fails. */
	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	/* NOENTER: the caller only wanted the page tables expanded, not a mapping. */
	if (options & PMAP_OPTIONS_NOENTER) {
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before commiting
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = pte_is_valid(spte);

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_EMPTY;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_EMPTY);
		}

		/* Build the PTE template; pp_attr_bits collects ref/mod bits to set on commit. */
		pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, options, &pp_attr_bits);

		if (pa_valid(pa)) {
			/* Kernel-managed page: PV list maintenance and accounting required. */
			unsigned int pai;
			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			/*
			 * A PV node is only needed when the page already has at least one
			 * mapping and that mapping isn't this very PTE.
			 */
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				/* Pre-fill the per-CPU free list, then return the nodes to it. */
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Return the nodes to the free list; pmap_enter_pv() will re-take them. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			/* Attempt to atomically swap the new PTE in; on failure, retry the transaction. */
			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = pte_is_valid(spte);
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			/* Transaction committed: it is now safe to publish ref/mod attribute bits. */
			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			/* Ledger accounting for a brand-new user mapping, per the flags computed above. */
			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-kernel-managed (e.g. I/O) page: no PV list, no ledger accounting. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed.  We must therefore prevent an attacker from downgrading a
			 * a writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    pte_is_valid(spte) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = pte_is_valid(spte);
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	/* Drop the speculative ptd refcount taken at the top if it wasn't consumed. */
	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	/* Undo the temporary wire taken to keep the PTE page resident. */
	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	/* RO-zone pages get their physical-aperture mapping write-protected last. */
	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		pmap_phys_write_disable(v);
	}

	return kr;
}
6681 
6682 kern_return_t
6683 pmap_enter_options_addr(
6684 	pmap_t pmap,
6685 	vm_map_address_t v,
6686 	pmap_paddr_t pa,
6687 	vm_prot_t prot,
6688 	vm_prot_t fault_type,
6689 	unsigned int flags,
6690 	boolean_t wired,
6691 	unsigned int options,
6692 	__unused void   *arg,
6693 	__unused pmap_mapping_type_t mapping_type)
6694 {
6695 	kern_return_t kr = KERN_FAILURE;
6696 
6697 
6698 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6699 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6700 
6701 
6702 	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
6703 	do {
6704 #if XNU_MONITOR
6705 		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
6706 #else
6707 		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6708 #endif
6709 
6710 		if (kr == KERN_RESOURCE_SHORTAGE) {
6711 #if XNU_MONITOR
6712 			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6713 #endif
6714 			if (nowait_requested) {
6715 				break;
6716 			}
6717 		}
6718 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
6719 
6720 #if XNU_MONITOR
6721 	pmap_ledger_check_balance(pmap);
6722 #endif
6723 
6724 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6725 
6726 	return kr;
6727 }
6728 
6729 kern_return_t
6730 pmap_enter_options(
6731 	pmap_t pmap,
6732 	vm_map_address_t v,
6733 	ppnum_t pn,
6734 	vm_prot_t prot,
6735 	vm_prot_t fault_type,
6736 	unsigned int flags,
6737 	boolean_t wired,
6738 	unsigned int options,
6739 	__unused void   *arg,
6740 	pmap_mapping_type_t mapping_type)
6741 {
6742 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6743 }
6744 
6745 /*
6746  *	Routine:	pmap_change_wiring
6747  *	Function:	Change the wiring attribute for a map/virtual-address
6748  *			pair.
6749  *	In/out conditions:
6750  *			The mapping must already exist in the pmap.
6751  */
6752 MARK_AS_PMAP_TEXT kern_return_t
6753 pmap_change_wiring_internal(
6754 	pmap_t pmap,
6755 	vm_map_address_t v,
6756 	boolean_t wired)
6757 {
6758 	pt_entry_t     *pte_p;
6759 	pmap_paddr_t    pa;
6760 
6761 	validate_pmap_mutable(pmap);
6762 
6763 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6764 		return KERN_ABORTED;
6765 	}
6766 
6767 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6768 
6769 	pte_p = pmap_pte(pmap, v);
6770 	if (pte_p == PT_ENTRY_NULL) {
6771 		if (!wired) {
6772 			/*
6773 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6774 			 * may have been freed by a remove operation.
6775 			 */
6776 			goto pmap_change_wiring_return;
6777 		} else {
6778 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6779 		}
6780 	}
6781 	/*
6782 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6783 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6784 	 */
6785 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6786 
6787 	while (pa_valid(pa)) {
6788 		pmap_paddr_t new_pa;
6789 
6790 		pvh_lock(pa_index(pa));
6791 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6792 
6793 		if (pa == new_pa) {
6794 			break;
6795 		}
6796 
6797 		pvh_unlock(pa_index(pa));
6798 		pa = new_pa;
6799 	}
6800 
6801 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6802 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6803 		if (!wired) {
6804 			/* PTE cleared by prior remove/disconnect operation */
6805 			goto pmap_change_wiring_cleanup;
6806 		} else {
6807 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6808 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6809 		}
6810 	}
6811 
6812 	assertf(pte_is_valid(*pte_p), "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6813 	if (wired != pte_is_wired(*pte_p)) {
6814 		pte_set_wired(pmap, pte_p, wired);
6815 		if (pmap != kernel_pmap) {
6816 			if (wired) {
6817 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6818 			} else if (!wired) {
6819 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6820 			}
6821 		}
6822 	}
6823 
6824 pmap_change_wiring_cleanup:
6825 	if (pa_valid(pa)) {
6826 		pvh_unlock(pa_index(pa));
6827 	}
6828 
6829 pmap_change_wiring_return:
6830 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6831 
6832 	return KERN_SUCCESS;
6833 }
6834 
6835 void
6836 pmap_change_wiring(
6837 	pmap_t pmap,
6838 	vm_map_address_t v,
6839 	boolean_t wired)
6840 {
6841 	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
6842 	pmap_verify_preemptible();
6843 
6844 	kern_return_t kr = KERN_FAILURE;
6845 #if XNU_MONITOR
6846 	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6847 	do {
6848 		kr = pmap_change_wiring_ppl(pmap, v, wired);
6849 	} while (kr == KERN_ABORTED);
6850 
6851 	pmap_ledger_check_balance(pmap);
6852 #else
6853 	/* Since we verified preemptibility, call the helper only once. */
6854 	kr = pmap_change_wiring_internal(pmap, v, wired);
6855 #endif
6856 
6857 	if (kr != KERN_SUCCESS) {
6858 		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6859 		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6860 	}
6861 }
6862 
6863 MARK_AS_PMAP_TEXT pmap_paddr_t
6864 pmap_find_pa_internal(
6865 	pmap_t pmap,
6866 	addr64_t va)
6867 {
6868 	pmap_paddr_t    pa = 0;
6869 
6870 	validate_pmap(pmap);
6871 
6872 	if (pmap != kernel_pmap) {
6873 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6874 	}
6875 
6876 	pa = pmap_vtophys(pmap, va);
6877 
6878 	if (pmap != kernel_pmap) {
6879 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6880 	}
6881 
6882 	return pa;
6883 }
6884 
6885 pmap_paddr_t
6886 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6887 {
6888 	pmap_paddr_t pa = 0;
6889 
6890 	if (pmap == kernel_pmap) {
6891 		pa = mmu_kvtop(va);
6892 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6893 		/*
6894 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6895 		 * translation even if PAN would prevent kernel access through the translation.
6896 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6897 		 */
6898 		pa = mmu_uvtop(va);
6899 	}
6900 	return pa;
6901 }
6902 
6903 pmap_paddr_t
6904 pmap_find_pa(
6905 	pmap_t pmap,
6906 	addr64_t va)
6907 {
6908 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6909 
6910 	if (pa != 0) {
6911 		return pa;
6912 	}
6913 
6914 	if (not_in_kdp) {
6915 #if XNU_MONITOR
6916 		return pmap_find_pa_ppl(pmap, va);
6917 #else
6918 		return pmap_find_pa_internal(pmap, va);
6919 #endif
6920 	} else {
6921 		return pmap_vtophys(pmap, va);
6922 	}
6923 }
6924 
6925 ppnum_t
6926 pmap_find_phys_nofault(
6927 	pmap_t pmap,
6928 	addr64_t va)
6929 {
6930 	ppnum_t ppn;
6931 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6932 	return ppn;
6933 }
6934 
6935 ppnum_t
6936 pmap_find_phys(
6937 	pmap_t pmap,
6938 	addr64_t va)
6939 {
6940 	ppnum_t ppn;
6941 	ppn = atop(pmap_find_pa(pmap, va));
6942 	return ppn;
6943 }
6944 
6945 /**
6946  * Translate a kernel virtual address into a physical address.
6947  *
6948  * @param va The kernel virtual address to translate. Does not work on user
6949  *           virtual addresses.
6950  *
6951  * @return The physical address if the translation was successful, or zero if
6952  *         no valid mappings were found for the given virtual address.
6953  */
6954 pmap_paddr_t
6955 kvtophys(vm_offset_t va)
6956 {
6957 	/**
6958 	 * Attempt to do the translation first in hardware using the AT (address
6959 	 * translation) instruction. This will attempt to use the MMU to do the
6960 	 * translation for us.
6961 	 */
6962 	pmap_paddr_t pa = mmu_kvtop(va);
6963 
6964 	if (pa) {
6965 		return pa;
6966 	}
6967 
6968 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6969 	return pmap_vtophys(kernel_pmap, va);
6970 }
6971 
6972 /**
6973  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6974  * points to a non-kernel-managed physical page, then this call will panic().
6975  *
6976  * @note The output of this function is guaranteed to be a kernel-managed
6977  *       physical page, which means it's safe to pass the output directly to
6978  *       pa_index() to create a physical address index for various pmap data
6979  *       structures.
6980  *
6981  * @param va The kernel virtual address to translate. Does not work on user
6982  *           virtual addresses.
6983  *
6984  * @return The translated physical address for the given virtual address.
6985  */
6986 pmap_paddr_t
6987 kvtophys_nofail(vm_offset_t va)
6988 {
6989 	pmap_paddr_t pa = kvtophys(va);
6990 
6991 	if (!pa_valid(pa)) {
6992 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6993 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6994 	}
6995 
6996 	return pa;
6997 }
6998 
6999 pmap_paddr_t
7000 pmap_vtophys(
7001 	pmap_t pmap,
7002 	addr64_t va)
7003 {
7004 	if ((va < pmap->min) || (va >= pmap->max)) {
7005 		return 0;
7006 	}
7007 
7008 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7009 
7010 	tt_entry_t * ttp = NULL;
7011 	tt_entry_t * ttep = NULL;
7012 	tt_entry_t   tte = ARM_TTE_EMPTY;
7013 	pmap_paddr_t pa = 0;
7014 	unsigned int cur_level;
7015 
7016 	ttp = pmap->tte;
7017 
7018 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
7019 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
7020 
7021 		tte = *ttep;
7022 
7023 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
7024 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
7025 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
7026 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
7027 
7028 		if ((tte & valid_mask) != valid_mask) {
7029 			return (pmap_paddr_t) 0;
7030 		}
7031 
7032 		/* This detects both leaf entries and intermediate block mappings. */
7033 		if ((tte & type_mask) == type_block) {
7034 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
7035 			break;
7036 		}
7037 
7038 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
7039 	}
7040 
7041 	return pa;
7042 }
7043 
7044 /*
7045  *	pmap_init_pte_page - Initialize a page table page.
7046  */
/**
 * Initialize a page table page: associate a page table descriptor (PTD) with
 * the physical page backing 'pte_p' and record the table's pmap, VA range,
 * and level in the PTD's info.
 *
 * @param pmap      The pmap the new table page belongs to.
 * @param pte_p     Kernel-virtual pointer to the (already zeroed) table page.
 * @param va        The base virtual address the table will map.
 * @param ttlevel   The translation-table level of the new table.
 * @param alloc_ptd TRUE to allocate a PTD if the page has none (boot-time
 *                  path only); FALSE panics if no PTD is already present.
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	/* PV head for the physical page backing this table page. */
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* Reuse the PTD already attached to this page (see bootstrap note above). */
		ptdp = pvh_ptd(pvh);
	} else {
		/* Any other PVH type means this page is already in use as something else. */
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
7086 
7087 /*
7088  *	Routine:	pmap_expand
7089  *
7090  *	Expands a pmap to be able to map the specified virtual address.
7091  *
7092  *	Allocates new memory for the default (COARSE) translation table
7093  *	entry, initializes all the pte entries to ARM_PTE_EMPTY and
7094  *	also allocates space for the corresponding pv entries.
7095  *
7096  *	Nothing should be locked.
7097  */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject addresses outside this pmap's supported VA range. */
	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/*
	 * Descend from the root translation level toward the requested level,
	 * ensuring a table exists at each intermediate level covering 'v'.
	 */
	for (; ttlevel < level; ttlevel++) {
		/* Preemption-aware lock acquisition: bail out rather than block. */
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock across allocation, which may block or fail. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/*
			 * Re-check under the exclusive lock: another thread may have
			 * installed a table at this level while we were unlocked.
			 */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Link the newly initialized table into the parent TTE. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* If our allocated table lost the race above, release it. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
7169 
7170 /*
7171  *	Routine:	pmap_gc
7172  *	Function:
7173  *              Pmap garbage collection
7174  *		Called by the pageout daemon when pages are scarce.
7175  *
7176  */
void
pmap_gc(void)
{
	/*
	 * Intentionally a no-op.
	 *
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
7188 
7189 /*
7190  *      By default, don't attempt pmap GC more frequently
7191  *      than once / 1 minutes.
7192  */
7193 
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
	/* Intentionally empty: pmap GC throttling is not implemented here. */
}
7199 
7200 /*
7201  * pmap_attribute_cache_sync(vm_offset_t pa)
7202  *
7203  * Invalidates all of the instruction cache on a physical page and
7204  * pushes any dirty data from the data cache for the same physical page
7205  */
7206 
7207 kern_return_t
7208 pmap_attribute_cache_sync(
7209 	ppnum_t pp,
7210 	vm_size_t size,
7211 	__unused vm_machine_attribute_t attribute,
7212 	__unused vm_machine_attribute_val_t * value)
7213 {
7214 	if (size > PAGE_SIZE) {
7215 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7216 	} else {
7217 		cache_sync_page(pp);
7218 	}
7219 
7220 	return KERN_SUCCESS;
7221 }
7222 
7223 /*
7224  * pmap_sync_page_data_phys(ppnum_t pp)
7225  *
7226  * Invalidates all of the instruction cache on a physical page and
7227  * pushes any dirty data from the data cache for the same physical page
7228  */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	/* Defer to the common per-page cache synchronization helper. */
	cache_sync_page(pp);
}
7235 
7236 /*
7237  * pmap_sync_page_attributes_phys(ppnum_t pp)
7238  *
7239  * Write back and invalidate all cachelines on a physical page.
7240  */
7241 void
7242 pmap_sync_page_attributes_phys(
7243 	ppnum_t pp)
7244 {
7245 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7246 }
7247 
#if CONFIG_COREDUMP
/* temporary workaround */
boolean_t
coredumpok(
	vm_map_t map,
	mach_vm_offset_t va)
{
	pt_entry_t     *pte_p;
	pt_entry_t      spte;

	/* No PTE mapped at 'va': nothing to dump from this address. */
	pte_p = pmap_pte(map->pmap, va);
	if (0 == pte_p) {
		return FALSE;
	}
	/* Device-pager-backed entries are treated as not dumpable. */
	if (vm_map_entry_has_device_pager(map, va)) {
		return FALSE;
	}
	/* Only pages mapped with the default cache attributes are dumpable. */
	spte = *pte_p;
	return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
}
#endif
7269 
7270 void
7271 fillPage(
7272 	ppnum_t pn,
7273 	unsigned int fill)
7274 {
7275 	unsigned int   *addr;
7276 	int             count;
7277 
7278 	addr = (unsigned int *) phystokv(ptoa(pn));
7279 	count = PAGE_SIZE / sizeof(unsigned int);
7280 	while (count--) {
7281 		*addr++ = fill;
7282 	}
7283 }
7284 
7285 extern void     mapping_set_mod(ppnum_t pn);
7286 
void
mapping_set_mod(
	ppnum_t pn)
{
	/* Thin wrapper: mark the physical page as modified. */
	pmap_set_modify(pn);
}
7293 
7294 extern void     mapping_set_ref(ppnum_t pn);
7295 
void
mapping_set_ref(
	ppnum_t pn)
{
	/* Thin wrapper: mark the physical page as referenced. */
	pmap_set_reference(pn);
}
7302 
7303 /*
7304  * Clear specified attribute bits.
7305  *
7306  * Try to force an arm_fast_fault() for all mappings of
7307  * the page - to force attributes to be set again at fault time.
7308  * If the forcing succeeds, clear the cached bits at the head.
7309  * Otherwise, something must have been wired, so leave the cached
7310  * attributes alone.
7311  */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* PPL-owned attribute bits may not be cleared from outside the PPL. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-supplied arg or flush context implies the caller handles flushing. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Compute which access modes must fault so the bits can be re-observed. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Clear the cached bits only if all mappings could be made to fault. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7385 
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/* Single-page variant: no coalesced TLB flush range. */
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7395 
7396 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for all managed pages mapped in [start, end) within a
 * single twig (one leaf page table).  Returns the address at which processing
 * stopped: 'end' on completion, or an earlier VA if preemption is pending.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* The caller must confine the range to a single twig region. */
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* After the first page, return early if preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				/* Managed page: clear the requested attribute bits. */
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7440 
/*
 * Clear attribute bits across [start, end) in 'pmap', coalescing any required
 * TLB invalidations into one ranged flush.  Returns the address at which
 * processing stopped (== 'end' when the whole range was processed).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Process one twig-sized (leaf table) chunk at a time. */
	while (va < end) {
		vm_map_address_t curr_end;

		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig pass was cut short or preemption is pending. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the coalesced ranged TLB flush, if any PTE update required one. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7493 
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The internal routine may stop early (e.g. pending preemption); retry until done. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7524 #endif /* __ARM_RANGE_TLBI__ */
7525 
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

	/* Route through the PPL when the monitor is enabled. */
#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7547 
7548 /*
7549  *	Set specified attribute bits.
7550  *
7551  *	Set cached value in the pv head because we have
7552  *	no per-mapping hardware support for referenced and
7553  *	modify bits.
7554  */
7555 MARK_AS_PMAP_TEXT void
7556 phys_attribute_set_internal(
7557 	ppnum_t pn,
7558 	unsigned int bits)
7559 {
7560 	pmap_paddr_t    pa = ptoa(pn);
7561 	assert(pn != vm_page_fictitious_addr);
7562 
7563 #if XNU_MONITOR
7564 	if (bits & PP_ATTR_PPL_OWNED_BITS) {
7565 		panic("%s: illegal request, "
7566 		    "pn=%u, bits=%#x",
7567 		    __FUNCTION__,
7568 		    pn, bits);
7569 	}
7570 #endif
7571 
7572 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7573 
7574 	return;
7575 }
7576 
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
	/* Route through the PPL when the monitor is enabled. */
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7588 
7589 
7590 /*
7591  *	Check specified attribute bits.
7592  *
7593  *	use the software cached bits (since no hw support).
7594  */
7595 static boolean_t
7596 phys_attribute_test(
7597 	ppnum_t pn,
7598 	unsigned int bits)
7599 {
7600 	pmap_paddr_t    pa = ptoa(pn);
7601 	assert(pn != vm_page_fictitious_addr);
7602 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7603 }
7604 
7605 
7606 /*
7607  *	Set the modify/reference bits on the specified physical page.
7608  */
void
pmap_set_modify(ppnum_t pn)
{
	/* Set the software-tracked 'modified' bit for this physical page. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7614 
7615 
7616 /*
7617  *	Clear the modify bits on the specified physical page.
7618  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clearing 'modified' may force-fault existing mappings; see phys_attribute_clear(). */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7625 
7626 
7627 /*
7628  *	pmap_is_modified:
7629  *
7630  *	Return whether or not the specified physical page is modified
7631  *	by any physical maps.
7632  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Read the software-cached 'modified' bit. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7639 
7640 
7641 /*
7642  *	Set the reference bit on the specified physical page.
7643  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Set the software-tracked 'referenced' bit for this physical page. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7650 
7651 /*
7652  *	Clear the reference bits on the specified physical page.
7653  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clearing 'referenced' may force-fault existing mappings; see phys_attribute_clear(). */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7660 
7661 
7662 /*
7663  *	pmap_is_referenced:
7664  *
7665  *	Return whether or not the specified physical page is referenced
7666  *	by any physical maps.
7667  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Read the software-cached 'referenced' bit. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7674 
7675 /*
7676  * pmap_get_refmod(phys)
7677  *  returns the referenced and modified bits of the specified
7678  *  physical page.
7679  */
7680 unsigned int
7681 pmap_get_refmod(
7682 	ppnum_t pn)
7683 {
7684 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7685 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7686 }
7687 
7688 static inline unsigned int
7689 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7690 {
7691 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7692 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7693 }
7694 
7695 /*
7696  * pmap_clear_refmod(phys, mask)
7697  *  clears the referenced and modified bits as specified by the mask
7698  *  of the specified physical page.
7699  */
void
pmap_clear_refmod_options(
	ppnum_t         pn,
	unsigned int    mask,
	unsigned int    options,
	void            *arg)
{
	unsigned int    bits;

	/* Convert the VM_MEM_* mask to PP_ATTR_* bits before clearing. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear(pn, bits, options, arg);
}
7712 
7713 /*
7714  * Perform pmap_clear_refmod_options on a virtual address range.
7715  * The operation will be performed in bulk & tlb flushes will be coalesced
7716  * if possible.
7717  *
7718  * Returns true if the operation is supported on this platform.
7719  * If this function returns false, the operation is not supported and
7720  * nothing has been modified in the pmap.
7721  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int    bits;
	/* Convert the VM_MEM_* mask to PP_ATTR_* bits before clearing. */
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is a large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7749 
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	/* Convenience wrapper with no options or flush argument. */
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7757 
/*
 * Disconnect all mappings for physical page 'pn' (honoring 'options') and
 * return its referenced/modified state in VM_MEM_* format.
 */
unsigned int
pmap_disconnect_options(
	ppnum_t pn,
	unsigned int options,
	void *arg)
{
	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
		/*
		 * On ARM, the "modified" bit is managed by software, so
		 * we know up-front if the physical page is "modified",
		 * without having to scan all the PTEs pointing to it.
		 * The caller should have made the VM page "busy" so noone
		 * should be able to establish any new mapping and "modify"
		 * the page behind us.
		 */
		if (pmap_is_modified(pn)) {
			/*
			 * The page has been modified and will be sent to
			 * the VM compressor.
			 */
			options |= PMAP_OPTIONS_COMPRESSOR;
		} else {
			/*
			 * The page hasn't been modified and will be freed
			 * instead of compressed.
			 */
		}
	}

	/* disconnect the page */
	pmap_page_protect_options(pn, 0, options, arg);

	/* return ref/chg status */
	return pmap_get_refmod(pn);
}
7793 
7794 /*
7795  *	Routine:
7796  *		pmap_disconnect
7797  *
7798  *	Function:
7799  *		Disconnect all mappings for this page and return reference and change status
7800  *		in generic format.
7801  *
7802  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7810 
7811 boolean_t
7812 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7813 {
7814 	if (ptoa(first) >= vm_last_phys) {
7815 		return FALSE;
7816 	}
7817 	if (ptoa(last) < vm_first_phys) {
7818 		return FALSE;
7819 	}
7820 
7821 	return TRUE;
7822 }
7823 
7824 /*
7825  * The state maintained by the noencrypt functions is used as a
7826  * debugging aid on ARM.  This incurs some overhead on the part
7827  * of the caller.  A special case check in phys_attribute_clear
7828  * (the most expensive path) currently minimizes this overhead,
7829  * but stubbing these functions out on RELEASE kernels yields
7830  * further wins.
7831  */
boolean_t
pmap_is_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	boolean_t result = FALSE;

	/* Unmanaged pages carry no pp_attr state. */
	if (!pa_valid(ptoa(pn))) {
		return FALSE;
	}

	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));

	return result;
#else
#pragma unused(pn)
	/* Tracking is stubbed out on RELEASE kernels (see comment above). */
	return FALSE;
#endif
}
7851 
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no pp_attr state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7866 
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Unmanaged pages carry no pp_attr state. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7881 
#if XNU_MONITOR
/* Return whether the (managed) physical page is owned by the PPL monitor. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7890 
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	/* Managed pages use their per-page PV head lock. */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Unmanaged pages (or PPL builds) fall back to one global lock. */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7907 
7908 
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	/* Managed pages use their per-page PV head lock. */
	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Unmanaged pages (or PPL builds) fall back to one global lock. */
	{ simple_unlock(&phys_backup_lock);}
}
7925 
/*
 * Make 'pmap' the current user address space on this CPU: publish nested
 * (shared region) pmap state to the per-CPU data, update TCR when page
 * sizes can differ (__ARM_MIXED_PAGE_SIZE__), and load the user TTB with
 * the pmap's table root and ASID (or clear it for the kernel pmap).
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		pmap_t nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap = nested_pmap;
		if (nested_pmap != NULL) {
			cpu_data_ptr->cpu_nested_pmap_attr = pmap_get_pt_attr(nested_pmap);
			/**
			 * Obtain the full shared region bounds from the nested pmap.  If the top-level pmap
			 * hasn't been fully nested yet, its bounds may not yet be configured, or may be in the
			 * process of being configured on another core.
			 */
			cpu_data_ptr->cpu_nested_region_addr = nested_pmap->nested_region_addr;
			cpu_data_ptr->cpu_nested_region_size = nested_pmap->nested_region_size;
		}
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when this pmap's configuration differs from the live one. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	/* Load the user TTB: table root physical address plus the pmap's hardware ASID. */
	if (pmap != kernel_pmap) {
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7961 
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	/* Point the user TTB at the canonical invalid translation table. */
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7967 
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
	/* Route through the PPL when the monitor is enabled. */
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7979 
7980 
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Compile-time proof that the marker bit cannot collide with the SW bits it may coexist with. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7994 
7995 
7996 MARK_AS_PMAP_TEXT static boolean_t
7997 arm_force_fast_fault_with_flush_range(
7998 	ppnum_t         ppnum,
7999 	vm_prot_t       allow_mode,
8000 	int             options,
8001 	pmap_tlb_flush_range_t *flush_range)
8002 {
8003 	pmap_paddr_t     phys = ptoa(ppnum);
8004 	pv_entry_t      *pve_p;
8005 	pt_entry_t      *pte_p;
8006 	unsigned int     pai;
8007 	unsigned int     pass1_updated = 0;
8008 	unsigned int     pass2_updated = 0;
8009 	boolean_t        result;
8010 	pv_entry_t     **pv_h;
8011 	bool             is_reusable;
8012 	bool             ref_fault;
8013 	bool             mod_fault;
8014 	bool             clear_write_fault = false;
8015 	bool             ref_aliases_mod = false;
8016 	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
8017 
8018 	assert(ppnum != vm_page_fictitious_addr);
8019 
8020 	if (!pa_valid(phys)) {
8021 		return FALSE;   /* Not a managed page. */
8022 	}
8023 
8024 	result = TRUE;
8025 	ref_fault = false;
8026 	mod_fault = false;
8027 	pai = pa_index(phys);
8028 	if (__probable(mustsynch)) {
8029 		pvh_lock(pai);
8030 	}
8031 	pv_h = pai_to_pvh(pai);
8032 
8033 #if XNU_MONITOR
8034 	if (__improbable(ppattr_pa_test_monitor(phys))) {
8035 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
8036 	}
8037 #endif
8038 	pte_p = PT_ENTRY_NULL;
8039 	pve_p = PV_ENTRY_NULL;
8040 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8041 		pte_p = pvh_ptep(pv_h);
8042 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8043 		pve_p = pvh_pve_list(pv_h);
8044 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8045 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
8046 	}
8047 
8048 	is_reusable = ppattr_test_reusable(pai);
8049 
8050 	/*
8051 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
8052 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
8053 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
8054 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
8055 	 * operation, TLB invalidation may be handled by the caller so it's possible for
8056 	 * tlb_flush_needed to be true while issue_tlbi is false.
8057 	 */
8058 	bool issue_tlbi = false;
8059 	bool tlb_flush_needed = false;
8060 
8061 	pv_entry_t *orig_pve_p = pve_p;
8062 	pt_entry_t *orig_pte_p = pte_p;
8063 	int pve_ptep_idx = 0;
8064 
8065 	/*
8066 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8067 	 * TLB invalidation in pass 2.
8068 	 */
8069 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8070 		pt_entry_t       spte;
8071 		pt_entry_t       tmplate;
8072 
8073 		if (pve_p != PV_ENTRY_NULL) {
8074 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8075 			if (pte_p == PT_ENTRY_NULL) {
8076 				goto fff_skip_pve_pass1;
8077 			}
8078 		}
8079 
8080 #ifdef PVH_FLAG_IOMMU
8081 		if (pvh_ptep_is_iommu(pte_p)) {
8082 			goto fff_skip_pve_pass1;
8083 		}
8084 #endif
8085 		if (*pte_p == ARM_PTE_EMPTY) {
8086 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8087 		}
8088 		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
8089 			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8090 		}
8091 
8092 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8093 		const pmap_t pmap = ptdp->pmap;
8094 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8095 		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8096 
8097 		assert(va >= pmap->min && va < pmap->max);
8098 
8099 		/* update pmap stats and ledgers */
8100 		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8101 		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8102 		if (is_altacct) {
8103 			/*
8104 			 * We do not track "reusable" status for
8105 			 * "alternate accounting" mappings.
8106 			 */
8107 		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8108 		    is_reusable &&
8109 		    is_internal &&
8110 		    pmap != kernel_pmap) {
8111 			/* one less "reusable" */
8112 			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8113 			/* one more "internal" */
8114 			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8115 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8116 
8117 			/*
8118 			 * Since the page is being marked non-reusable, we assume that it will be
8119 			 * modified soon.  Avoid the cost of another trap to handle the fast
8120 			 * fault when we next write to this page.
8121 			 */
8122 			clear_write_fault = true;
8123 		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8124 		    !is_reusable &&
8125 		    is_internal &&
8126 		    pmap != kernel_pmap) {
8127 			/* one more "reusable" */
8128 			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8129 			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8130 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8131 		}
8132 
8133 		bool wiredskip = pte_is_wired(*pte_p) &&
8134 		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8135 
8136 		if (wiredskip) {
8137 			result = FALSE;
8138 			goto fff_skip_pve_pass1;
8139 		}
8140 
8141 		spte = *pte_p;
8142 		tmplate = spte;
8143 
8144 #if HAS_FEAT_XS
8145 		/**
8146 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
8147 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
8148 		 */
8149 		assert(!pte_is_xs(pt_attr, spte));
8150 #endif /* HAS_FEAT_XS */
8151 		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8152 			/* read protection sets the pte to fault */
8153 			tmplate =  tmplate & ~ARM_PTE_AF;
8154 			ref_fault = true;
8155 		}
8156 		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8157 			/* take away write permission if set */
8158 			if (pmap == kernel_pmap) {
8159 				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8160 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8161 					pte_set_was_writeable(tmplate, true);
8162 					mod_fault = true;
8163 				}
8164 			} else {
8165 				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8166 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8167 					pte_set_was_writeable(tmplate, true);
8168 					mod_fault = true;
8169 				}
8170 			}
8171 		}
8172 
8173 #if MACH_ASSERT && XNU_MONITOR
8174 		if (is_pte_xprr_protected(pmap, spte)) {
8175 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8176 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8177 				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
8178 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8179 				    ppnum, options, allow_mode);
8180 			}
8181 		}
8182 #endif /* MACH_ASSERT && XNU_MONITOR */
8183 
8184 		if (result && (tmplate != spte)) {
8185 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
8186 			    !(options & PMAP_OPTIONS_NOFLUSH)) {
8187 				tlb_flush_needed = true;
8188 				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8189 				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
8190 #ifdef ARM_PTE_FF_MARKER
8191 					assert(!(spte & ARM_PTE_FF_MARKER));
8192 					tmplate |= ARM_PTE_FF_MARKER;
8193 					++pass1_updated;
8194 #endif
8195 					issue_tlbi = true;
8196 				}
8197 			}
8198 			write_pte_fast(pte_p, tmplate);
8199 		}
8200 
8201 fff_skip_pve_pass1:
8202 		pte_p = PT_ENTRY_NULL;
8203 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8204 			pve_ptep_idx = 0;
8205 			pve_p = pve_next(pve_p);
8206 		}
8207 	}
8208 
8209 	if (tlb_flush_needed) {
8210 		FLUSH_PTE_STRONG();
8211 	}
8212 
8213 	if (!issue_tlbi) {
8214 		goto fff_finish;
8215 	}
8216 
8217 	/* Pass 2: Issue any required TLB invalidations */
8218 	pve_p = orig_pve_p;
8219 	pte_p = orig_pte_p;
8220 	pve_ptep_idx = 0;
8221 
8222 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8223 		if (pve_p != PV_ENTRY_NULL) {
8224 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8225 			if (pte_p == PT_ENTRY_NULL) {
8226 				goto fff_skip_pve_pass2;
8227 			}
8228 		}
8229 
8230 #ifdef PVH_FLAG_IOMMU
8231 		if (pvh_ptep_is_iommu(pte_p)) {
8232 			goto fff_skip_pve_pass2;
8233 		}
8234 #endif
8235 
8236 #ifdef ARM_PTE_FF_MARKER
8237 		pt_entry_t spte = *pte_p;
8238 
8239 		if (!(spte & ARM_PTE_FF_MARKER)) {
8240 			goto fff_skip_pve_pass2;
8241 		} else {
8242 			spte &= (~ARM_PTE_FF_MARKER);
8243 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8244 			write_pte_fast(pte_p, spte);
8245 			++pass2_updated;
8246 		}
8247 #endif
8248 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8249 		const pmap_t pmap = ptdp->pmap;
8250 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8251 
8252 		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8253 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8254 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
8255 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
8256 		}
8257 
8258 fff_skip_pve_pass2:
8259 		pte_p = PT_ENTRY_NULL;
8260 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8261 			pve_ptep_idx = 0;
8262 			pve_p = pve_next(pve_p);
8263 		}
8264 	}
8265 
8266 fff_finish:
8267 	if (__improbable(pass1_updated != pass2_updated)) {
8268 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8269 		    __func__, pass1_updated, pass2_updated);
8270 	}
8271 
8272 	/*
8273 	 * If we are using the same approach for ref and mod
8274 	 * faults on this PTE, do not clear the write fault;
8275 	 * this would cause both ref and mod to be set on the
8276 	 * page again, and prevent us from taking ANY read/write
8277 	 * fault on the mapping.
8278 	 */
8279 	if (clear_write_fault && !ref_aliases_mod) {
8280 		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
8281 	}
8282 	if (tlb_flush_needed) {
8283 		if (flush_range) {
8284 			/* Delayed flush. Signal to the caller that the flush is needed. */
8285 			flush_range->ptfr_flush_needed = true;
8286 		} else {
8287 			sync_tlb_flush();
8288 		}
8289 	}
8290 
8291 	/* update global "reusable" status for this page */
8292 	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8293 		ppattr_clear_reusable(pai);
8294 	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8295 		ppattr_set_reusable(pai);
8296 	}
8297 
8298 	if (mod_fault) {
8299 		ppattr_set_modfault(pai);
8300 	}
8301 	if (ref_fault) {
8302 		ppattr_set_reffault(pai);
8303 	}
8304 	if (__probable(mustsynch)) {
8305 		pvh_unlock(pai);
8306 	}
8307 	return result;
8308 }
8309 
8310 MARK_AS_PMAP_TEXT boolean_t
8311 arm_force_fast_fault_internal(
8312 	ppnum_t         ppnum,
8313 	vm_prot_t       allow_mode,
8314 	int             options)
8315 {
8316 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8317 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8318 	}
8319 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8320 }
8321 
8322 /*
8323  *	Routine:	arm_force_fast_fault
8324  *
8325  *	Function:
8326  *		Force all mappings for this page to fault according
8327  *		to the access modes allowed, so we can gather ref/modify
8328  *		bits again.
8329  */
8330 
8331 boolean_t
8332 arm_force_fast_fault(
8333 	ppnum_t         ppnum,
8334 	vm_prot_t       allow_mode,
8335 	int             options,
8336 	__unused void   *arg)
8337 {
8338 	pmap_paddr_t    phys = ptoa(ppnum);
8339 
8340 	assert(ppnum != vm_page_fictitious_addr);
8341 
8342 	if (!pa_valid(phys)) {
8343 		return FALSE;   /* Not a managed page. */
8344 	}
8345 
8346 #if XNU_MONITOR
8347 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8348 #else
8349 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8350 #endif
8351 }
8352 
8353 /*
8354  *	Routine:	arm_clear_fast_fault
8355  *
8356  *	Function:
8357  *		Clear pending force fault for all mappings for this page based on
8358  *		the observed fault type, update ref/modify bits.
8359  */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	/*
	 * Restore access/write permission on the mappings of 'ppnum' that were
	 * previously downgraded for fault-driven ref/mod tracking, as selected
	 * by 'fault_type'.
	 *
	 * pte_p: if non-NULL, only this specific PTE is repaired; if NULL, the
	 *        page's entire PV list is walked (possibly in bounded chunks of
	 *        PMAP_MAX_PV_LIST_CHUNK_SIZE PV entries per call).
	 *
	 * Returns TRUE if at least one PTE was updated.  Caller must hold the
	 * PVH lock for the page (see pvh_assert_locked() below).
	 */
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;
	/* pass1_updated/pass2_updated must agree; see the panic at cff_finish. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If no specific PTE was supplied, start from the page's PV head. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting position so pass 2 can re-walk the same list. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; nothing to repair here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/*
			 * Write fault on a mapping that was downgraded from writeable:
			 * restore RW permission, set AF, and record both ref and mod.
			 */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault with AF clear: set AF and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_EMPTY);
		if (spte != tmplate) {
			/*
			 * A change beyond the SW "was-writeable" bit needs a TLB flush;
			 * tag the PTE so pass 2 can find it.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			/*
			 * Bound the amount of work done under the PVH lock; the caller
			 * may redrive the fault with a targeted PTE if we bail early.
			 */
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make pass-1 PTE stores visible before issuing any TLB invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need invalidation; clear the tag. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited the same set of tagged PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8561 
8562 /*
8563  * Determine if the fault was induced by software tracking of
8564  * modify/reference bits.  If so, re-enable the mapping (and set
8565  * the appropriate bits).
8566  *
8567  * Returns KERN_SUCCESS if the fault was induced and was
8568  * successfully handled.
8569  *
8570  * Returns KERN_FAILURE if the fault was not induced and
8571  * the function was unable to deal with it.
8572  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8574  * disallows this type of access.
8575  *
8576  * Returns KERN_ABORTED if the pmap lock is taken and a
8577  * preemption is pending.
8578  *
8579  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_EMPTY;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Preemptible lock attempt: bail with KERN_ABORTED so the caller redrives. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Snapshot the PTE, take the PVH lock for its physical page, then
		 * re-read the PTE; loop until the snapshot is stable under the lock.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_EMPTY) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				/* Non-managed physical memory: no PV state to consult. */
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	/* Case 1: the page has pending ref/mod fault state to clear. */
	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	/* Case 3: fall back to a targeted repair of just the faulting PTE. */
	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8719 
8720 kern_return_t
8721 arm_fast_fault(
8722 	pmap_t pmap,
8723 	vm_map_address_t va,
8724 	vm_prot_t fault_type,
8725 	bool was_af_fault,
8726 	__unused bool from_user)
8727 {
8728 	kern_return_t   result = KERN_FAILURE;
8729 
8730 	if (va < pmap->min || va >= pmap->max) {
8731 		return result;
8732 	}
8733 
8734 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8735 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8736 	    from_user);
8737 
8738 	do {
8739 #if XNU_MONITOR
8740 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8741 #else
8742 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8743 #endif
8744 	} while (result == KERN_ABORTED);
8745 
8746 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8747 
8748 	return result;
8749 }
8750 
8751 void
8752 pmap_copy_page(
8753 	ppnum_t psrc,
8754 	ppnum_t pdst,
8755 	int options)
8756 {
8757 	bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8758 	    (addr64_t) (ptoa(pdst)),
8759 	    PAGE_SIZE,
8760 	    options);
8761 }
8762 
8763 
8764 /*
8765  *	pmap_copy_page copies the specified (machine independent) pages.
8766  */
8767 void
8768 pmap_copy_part_page(
8769 	ppnum_t psrc,
8770 	vm_offset_t src_offset,
8771 	ppnum_t pdst,
8772 	vm_offset_t dst_offset,
8773 	vm_size_t len)
8774 {
8775 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8776 	    (addr64_t) (ptoa(pdst) + dst_offset),
8777 	    len);
8778 }
8779 
8780 
8781 /*
8782  *	pmap_zero_page zeros the specified (machine independent) page.
8783  */
8784 void
8785 pmap_zero_page(
8786 	ppnum_t pn)
8787 {
8788 	assert(pn != vm_page_fictitious_addr);
8789 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8790 }
8791 
8792 void
8793 pmap_zero_page_with_options(
8794 	ppnum_t pn,
8795 	int options)
8796 {
8797 	assert(pn != vm_page_fictitious_addr);
8798 	bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8799 }
8800 
8801 /*
8802  *	pmap_zero_part_page
8803  *	zeros the specified (machine independent) part of a page.
8804  */
8805 void
8806 pmap_zero_part_page(
8807 	ppnum_t pn,
8808 	vm_offset_t offset,
8809 	vm_size_t len)
8810 {
8811 	assert(pn != vm_page_fictitious_addr);
8812 	assert(offset + len <= PAGE_SIZE);
8813 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8814 }
8815 
void
pmap_map_globals(
	void)
{
	/*
	 * Map the static lowGlo structure at its fixed alias address
	 * (LOWGLOBAL_ALIAS) in the kernel pmap.  The alias must not already
	 * be mapped.
	 */
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/*
	 * Non-executable (NX|PNX) mapping of lowGlo's physical address.
	 * NOTE(review): AP_RONA is or-ed in raw here, without the ARM_PTE_AP()
	 * wrapper used elsewhere in this file — confirm intended.
	 */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	/* Cacheable write-back, outer-shareable normal memory. */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Let KASAN know the alias range is now valid to access. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8840 
8841 vm_offset_t
8842 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8843 {
8844 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8845 		panic("%s: invalid index %u", __func__, index);
8846 	}
8847 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8848 }
8849 
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	/*
	 * Map physical page 'pn' into a free per-CPU copy window on the calling
	 * CPU, with the requested protection and caching (wimg) attributes.
	 * Returns the window index, to be passed back to
	 * pmap_unmap_cpu_windows_copy_internal(); panics if no window is free.
	 */
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	/* Only non-managed (I/O) pages have meaningful cache attributes here. */
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* The PPL forbids copy-window mappings of managed pages and writable
	 * mappings of PPL-protected I/O, unless the PPL is disabled. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first window on this CPU whose PTE is currently invalid. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (!pte_is_valid(*ptep)) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a valid, accessed, never-executable kernel mapping for 'pn'. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	/* Use the strong-sync flag recorded by the PREVIOUS user of this window. */
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8924 
/*
 * Map a physical page into a per-CPU copy window; returns the window index.
 * Dispatches to the PPL when the monitor is enabled.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8937 
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	/*
	 * Tear down the per-CPU copy window at 'index' on the calling CPU:
	 * clear its PTE and invalidate the TLB entry, using the strong-sync
	 * flag recorded when the window was mapped.
	 */
	pt_entry_t      *ptep;
	unsigned int    cpu_num;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	write_pte_strong(ptep, ARM_PTE_EMPTY);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8958 
/*
 * Unmap the per-CPU copy window at the given index on the calling CPU.
 * Dispatches to the PPL when the monitor is enabled.
 *
 * Note: the previous form used 'return <void call>;', which is an ISO C
 * constraint violation (C11 6.8.6.4) flagged under -Wpedantic; call the
 * implementation and fall off the end instead.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8969 
8970 #if XNU_MONITOR
8971 
MARK_AS_PMAP_TEXT void
pmap_invoke_with_page(
	ppnum_t page_number,
	void *ctx,
	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
{
	/* Intentionally a no-op in this configuration; the callback is never invoked. */
	#pragma unused(page_number, ctx, callback)
}
8980 
8981 /*
8982  * Loop over every pmap_io_range (I/O ranges marked as owned by
8983  * the PPL in the device tree) and conditionally call callback() on each range
8984  * that needs to be included in the hibernation image.
8985  *
8986  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8987  *                 context is needed in the callback.
8988  * @param callback Callback function invoked on each range (gated by flag).
8989  */
8990 MARK_AS_PMAP_TEXT void
8991 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8992 {
8993 	extern const pmap_io_range_t* io_attr_table;
8994 	extern const unsigned int num_io_rgns;
8995 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8996 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8997 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8998 		}
8999 	}
9000 }
9001 
9002 /**
9003  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
9004  * PPL-owned page. Otherwise, do nothing.
9005  *
9006  * @param addr Physical address of the page to set the HASHED flag on.
9007  */
9008 MARK_AS_PMAP_TEXT void
9009 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
9010 {
9011 	/* Ignore non-managed kernel memory. */
9012 	if (!pa_valid(addr)) {
9013 		return;
9014 	}
9015 
9016 	const unsigned int pai = pa_index(addr);
9017 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
9018 		pv_entry_t **pv_h = pai_to_pvh(pai);
9019 
9020 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
9021 		pvh_lock(pai);
9022 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
9023 		pvh_unlock(pai);
9024 	}
9025 }
9026 
9027 /**
9028  * Loop through every physical page in the system and clear out the HASHED flag
9029  * on every PPL-owned page. That flag is used to keep track of which pages have
9030  * been hashed into the hibernation image during the hibernation entry process.
9031  *
9032  * The HASHED flag needs to be cleared out between hibernation cycles because the
9033  * pv_head_table and pp_attr_table's might have been copied into the hibernation
9034  * image with the HASHED flag set on certain pages. It's important to clear the
9035  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
9036  * into the hibernation image can't be compromised across hibernation cycles.
9037  */
9038 MARK_AS_PMAP_TEXT void
9039 pmap_clear_ppl_hashed_flag_all(void)
9040 {
9041 	const unsigned int last_index = pa_index(vm_last_phys);
9042 	pv_entry_t **pv_h = NULL;
9043 
9044 	for (int pai = 0; pai < last_index; ++pai) {
9045 		pv_h = pai_to_pvh(pai);
9046 
9047 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
9048 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
9049 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
9050 			pvh_lock(pai);
9051 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
9052 			pvh_unlock(pai);
9053 		}
9054 	}
9055 }
9056 
9057 /**
9058  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9059  * ppl_hib driver will call this after all wired pages have been copied into the
9060  * hibernation image.
9061  */
9062 MARK_AS_PMAP_TEXT void
9063 pmap_check_ppl_hashed_flag_all(void)
9064 {
9065 	const unsigned int last_index = pa_index(vm_last_phys);
9066 	pv_entry_t **pv_h = NULL;
9067 
9068 	for (int pai = 0; pai < last_index; ++pai) {
9069 		pv_h = pai_to_pvh(pai);
9070 
9071 		/**
9072 		 * The PMAP stacks are explicitly not saved into the image so skip checking
9073 		 * the pages that contain the PMAP stacks.
9074 		 */
9075 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9076 		    (pai < pa_index(pmap_stacks_end_pa));
9077 
9078 		if (!is_pmap_stack &&
9079 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9080 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9081 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9082 		}
9083 	}
9084 }
9085 
9086 #endif /* XNU_MONITOR */
9087 
9088 /*
9089  * Indicate that a pmap is intended to be used as a nested pmap
9090  * within one or more larger address spaces.  This must be set
9091  * before pmap_nest() is called with this pmap as the 'subordinate'.
9092  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/*
	 * Atomically transition the pmap from USER to NESTED exactly once; any
	 * other starting type (or a repeated call) is a fatal error.
	 */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_set_shared_region() hasn't tried
	 * to give this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Release this pmap's address-space ID via the per-pmap ops table. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9139 
9140 __mockable void
9141 pmap_set_nested(
9142 	pmap_t pmap)
9143 {
9144 #if XNU_MONITOR
9145 	pmap_set_nested_ppl(pmap);
9146 #else
9147 	pmap_set_nested_internal(pmap);
9148 #endif
9149 }
9150 
9151 bool
9152 pmap_is_nested(
9153 	pmap_t pmap)
9154 {
9155 	return pmap->type == PMAP_TYPE_NESTED;
9156 }
9157 
9158 /*
9159  * pmap_trim_range(pmap, start, end)
9160  *
9161  * pmap  = pmap to operate on
9162  * start = start of the range
9163  * end   = end of the range
9164  *
9165  * Attempts to deallocate TTEs for the given range in the nested range.
9166  */
9167 MARK_AS_PMAP_TEXT static void
9168 pmap_trim_range(
9169 	pmap_t pmap,
9170 	addr64_t start,
9171 	addr64_t end)
9172 {
9173 	addr64_t cur;
9174 	addr64_t nested_region_start;
9175 	addr64_t nested_region_end;
9176 	addr64_t adjusted_start;
9177 	addr64_t adjusted_end;
9178 	addr64_t adjust_offmask;
9179 	tt_entry_t * tte_p;
9180 	pt_entry_t * pte_p;
9181 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9182 
9183 	if (__improbable(end < start)) {
9184 		panic("%s: invalid address range, "
9185 		    "pmap=%p, start=%p, end=%p",
9186 		    __func__,
9187 		    pmap, (void*)start, (void*)end);
9188 	}
9189 
9190 	nested_region_start = pmap->nested_region_addr;
9191 	nested_region_end = nested_region_start + pmap->nested_region_size;
9192 
9193 	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9194 		panic("%s: range outside nested region %p-%p, "
9195 		    "pmap=%p, start=%p, end=%p",
9196 		    __func__, (void *)nested_region_start, (void *)nested_region_end,
9197 		    pmap, (void*)start, (void*)end);
9198 	}
9199 
9200 	/* Contract the range to TT page boundaries. */
9201 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9202 	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9203 	adjusted_end = end & ~adjust_offmask;
9204 
9205 	/* Iterate over the range, trying to remove TTEs. */
9206 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
9207 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9208 
9209 		tte_p = pmap_tte(pmap, cur);
9210 
9211 		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
9212 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
9213 
9214 			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
9215 			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
9216 				/* Deallocate for the nested map. */
9217 				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9218 			} else if (pmap->type == PMAP_TYPE_USER) {
9219 				/**
9220 				 * Just remove for the parent map. If the leaf table pointed
9221 				 * to by the TTE being removed (owned by the nested pmap)
9222 				 * has any mappings, then this call will panic. This
9223 				 * enforces the policy that tables being trimmed must be
9224 				 * empty to prevent possible use-after-free attacks.
9225 				 */
9226 				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9227 			} else {
9228 				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9229 			}
9230 		} else {
9231 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9232 		}
9233 	}
9234 
9235 	/* Remove empty L2 TTs. */
9236 	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9237 	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
9238 
9239 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
9240 		/* For each L1 entry in our range... */
9241 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9242 
9243 		bool remove_tt1e = true;
9244 		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9245 		tt_entry_t * tt2e_start;
9246 		tt_entry_t * tt2e_end;
9247 		tt_entry_t * tt2e_p;
9248 		tt_entry_t tt1e;
9249 
9250 		if (tt1e_p == NULL) {
9251 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9252 			continue;
9253 		}
9254 
9255 		tt1e = *tt1e_p;
9256 
9257 		if (tt1e == ARM_TTE_TYPE_FAULT) {
9258 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9259 			continue;
9260 		}
9261 
9262 		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9263 		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9264 
9265 		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9266 			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9267 				/*
9268 				 * If any TTEs are populated, don't remove the
9269 				 * L1 TT.
9270 				 */
9271 				remove_tt1e = false;
9272 			}
9273 		}
9274 
9275 		if (remove_tt1e) {
9276 			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9277 		} else {
9278 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9279 		}
9280 	}
9281 }
9282 
9283 /**
9284  * State machine for multi-step pmap trimming. Trimming is the action of
9285  * deallocating the TTEs of the shared region of pmaps down to a given range.
9286  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9287  * disabling preemption for too long. These steps include computing the bounds
9288  * of the shared region, trimming the head of the "grand", trimming the tail of
9289  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9290  * different conditions.
9291  *
9292  * @param grand the pmap in which the pages are nested
9293  * @param subord the pmap from which the pages are shared, or nested
9294  * @param vstart start of the used range in "grand"
9295  * @param size size of the used range
9296  * @param state the current state of the state machine
9297  *
9298  * @return the next state of the state machine, to be used in the next call
9299  *         into this function.
9300  */
9301 MARK_AS_PMAP_TEXT pmap_trim_state_t
9302 pmap_trim_internal(
9303 	pmap_t grand,
9304 	pmap_t subord,
9305 	addr64_t vstart,
9306 	uint64_t size,
9307 	pmap_trim_state_t state)
9308 {
9309 	/* Validation needs to be done regardless of state. */
9310 	addr64_t vend;
9311 
9312 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9313 		panic("%s: grand addr wraps around, "
9314 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9315 		    __func__, grand, subord, (void*)vstart, size, state);
9316 	}
9317 
9318 	validate_pmap_mutable(grand);
9319 	validate_pmap(subord);
9320 
9321 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9322 		panic("%s: subord is of non-nestable type 0x%hhx, "
9323 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9324 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9325 	}
9326 
9327 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9328 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9329 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9330 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9331 	}
9332 
9333 	if (__improbable(grand->nested_pmap != subord)) {
9334 		panic("%s: grand->nested != subord, "
9335 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9336 		    __func__, grand, subord, (void*)vstart, size, state);
9337 	}
9338 
9339 	if (__improbable((size != 0) &&
9340 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9341 		panic("%s: grand range not in nested region, "
9342 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9343 		    __func__, grand, subord, (void*)vstart, size, state);
9344 	}
9345 
9346 
9347 	/* Trimming starts with figuring out the bounds for the grand. */
9348 	if (state == PMAP_TRIM_STATE_START) {
9349 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9350 
9351 		/**
9352 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9353 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9354 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9355 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9356 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9357 		 * PMAP_TRIM_STATE_DONE.
9358 		 */
9359 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9360 			assert(subord->nested_bounds_set);
9361 
9362 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9363 			if (!grand->nested_bounds_set) {
9364 				/* Inherit the bounds from subord. */
9365 				grand->nested_region_true_start = subord->nested_region_true_start;
9366 				grand->nested_region_true_end = subord->nested_region_true_end;
9367 				grand->nested_bounds_set = true;
9368 			}
9369 
9370 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9371 
9372 			/* Now that the grand has bounds, we are done. */
9373 			return PMAP_TRIM_STATE_DONE;
9374 		}
9375 
9376 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9377 		if ((!subord->nested_bounds_set) && size) {
9378 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9379 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9380 
9381 			subord->nested_region_true_start = vstart;
9382 			subord->nested_region_true_end = vend;
9383 			subord->nested_region_true_start &= ~adjust_offmask;
9384 
9385 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9386 				panic("%s: padded true end wraps around, "
9387 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9388 				    __func__, grand, subord, (void*)vstart, size, state);
9389 			}
9390 
9391 			subord->nested_region_true_end &= ~adjust_offmask;
9392 			subord->nested_bounds_set = true;
9393 		}
9394 
9395 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9396 		if (subord->nested_bounds_set) {
9397 			/* Inherit the bounds from subord. */
9398 			grand->nested_region_true_start = subord->nested_region_true_start;
9399 			grand->nested_region_true_end = subord->nested_region_true_end;
9400 			grand->nested_bounds_set = true;
9401 
9402 			/* If we know the bounds, we can trim the pmap. */
9403 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9404 
9405 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9406 		} else {
9407 			/* Don't trim if we don't know the bounds. */
9408 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9409 
9410 			return PMAP_TRIM_STATE_DONE;
9411 		}
9412 	}
9413 
9414 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9415 	if (!grand->nested_bounds_set) {
9416 		panic("%s: !grand->nested_bounds_set, "
9417 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9418 		    __func__, grand, subord, (void*)vstart, size, state);
9419 	}
9420 
9421 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9422 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9423 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9424 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9425 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9426 			    (unsigned int)grand->nested_no_bounds_ref_state);
9427 		}
9428 
9429 #if XNU_MONITOR
9430 		if (pmap_pending_preemption()) {
9431 			return PMAP_TRIM_STATE_GRAND_AFTER;
9432 		}
9433 #endif
9434 
9435 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9436 	}
9437 
9438 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9439 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9440 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9441 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9442 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9443 			    (unsigned int)grand->nested_no_bounds_ref_state);
9444 		}
9445 
9446 #if XNU_MONITOR
9447 		if (pmap_pending_preemption()) {
9448 			return PMAP_TRIM_STATE_SUBORD;
9449 		}
9450 #endif
9451 
9452 		state = PMAP_TRIM_STATE_SUBORD;
9453 	}
9454 
9455 	/* START state is guaranteed to compute the bounds for the subord. */
9456 	if (!subord->nested_bounds_set) {
9457 		panic("%s: !subord->nested_bounds_set, "
9458 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9459 		    __func__, grand, subord, (void*)vstart, size, state);
9460 	}
9461 
9462 	if (state == PMAP_TRIM_STATE_SUBORD) {
9463 		/**
9464 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9465 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9466 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9467 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9468 		 * the state update is visible only once the preceding trim operation is complete.  An
9469 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9470 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9471 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9472 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9473 		 * of the state CAS.
9474 		 */
9475 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9476 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9477 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9478 			    (unsigned int)grand->nested_no_bounds_ref_state);
9479 		}
9480 		pmap_trim_subord(subord);
9481 	}
9482 
9483 	return PMAP_TRIM_STATE_DONE;
9484 }
9485 
/*
 * Drop this pmap's no-bounds reference on its nested pmap, trimming this
 * pmap's view of the nested region down to the true bounds (when known),
 * and then attempt to trim the nested pmap itself in case this was the
 * last no-bounds reference.
 *
 * NOTE(review): no lock is taken on 'pmap' itself here — presumably the
 * caller guarantees exclusive access (e.g. during teardown); confirm
 * against callers.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's true bounds under its shared lock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the head and tail of the nested region outside the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9509 
9510 /*
9511  * pmap_trim_subord(grand, subord)
9512  *
9513  * grand  = pmap that we have nested subord in
9514  * subord = nested pmap we are attempting to trim
9515  *
9516  * Trims subord if possible
9517  */
9518 MARK_AS_PMAP_TEXT static void
9519 pmap_trim_subord(pmap_t subord)
9520 {
9521 	bool contract_subord = false;
9522 
9523 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9524 
9525 	subord->nested_no_bounds_refcnt--;
9526 
9527 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9528 		/* If this was the last no bounds reference, trim subord. */
9529 		contract_subord = true;
9530 	}
9531 
9532 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9533 
9534 	if (contract_subord) {
9535 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9536 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9537 	}
9538 }
9539 
9540 /**
9541  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9542  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9543  * disabling preemption for too long.
9544  *
9545  * @note When we load the shared region we always create pages tables for the
9546  *       entire region. In practice, the shared cache may use just a portion
9547  *       of that. Before we know the bounds of the shared region, it can
9548  *       already be mapped into processes. Therefore, once the bounds are
9549  *       known, "trimming" comes in handy to remove the unnecessary page
9550  *       tables in the processes the shared region is mapped in, and eventually
9551  *       those in the shared region itself. Note that the shared region must
9552  *       be trimmed after the user processes because it has the L3 entries
9553  *       everyone else is pointing to.
9554  *
9555  * @param grand the pmap in which the pages are nested
9556  * @param subord the pmap from which the pages are shared, or nested
9557  * @param vstart start of the used range in "grand"
9558  * @param size size of the used range
9559  */
9560 void
9561 pmap_trim(
9562 	pmap_t grand,
9563 	pmap_t subord,
9564 	addr64_t vstart,
9565 	uint64_t size)
9566 {
9567 	pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9568 
9569 #if XNU_MONITOR
9570 	/* On PPL systems, drives the state machine until its done. */
9571 	while (state != PMAP_TRIM_STATE_DONE) {
9572 		__assert_only pmap_trim_state_t old_state = state;
9573 		state = pmap_trim_ppl(grand, subord, vstart, size, state);
9574 
9575 		/* Are we making progress? */
9576 		assert(old_state != state);
9577 	}
9578 
9579 	pmap_ledger_check_balance(grand);
9580 	pmap_ledger_check_balance(subord);
9581 #else
9582 	state = pmap_trim_internal(grand, subord, vstart, size, state);
9583 
9584 	/* On non-PPL systems, we expect the implementation to finish in one call. */
9585 	assert(state == PMAP_TRIM_STATE_DONE);
9586 #endif
9587 }
9588 
9589 #if HAS_APPLE_PAC
/*
 * Sign a user-space pointer with the given process-independent PAC key
 * (asia or asda only) under the caller-supplied user JOP key.
 *
 * Interrupts are disabled for the duration so the temporary user JOP key
 * state cannot be observed or disturbed by a context switch; the compiler
 * barriers keep the sign instruction from being reordered outside the
 * window in which the user JOP key is enabled.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9621 
9622 void *
9623 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9624 {
9625 	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9626 }
9627 
/*
 * Authenticate a user-space pointer signed with a process-independent PAC
 * key (asia or asda only) under the caller-supplied user JOP key.
 *
 * As with signing, interrupts are disabled around the user-JOP-key window
 * and compiler barriers pin the auth operation inside that window.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9648 
9649 void *
9650 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9651 {
9652 	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9653 }
9654 #endif /* HAS_APPLE_PAC */
9655 
9656 /*
9657  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9658  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9659  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9660  * return value, to indicate where a preempted [un]nest operation should resume.
9661  * When the return value contains the ending address of the nested region with
9662  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9663  */
9664 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9665 
9666 /**
9667  * Establishes the pmap associated with a shared region as the nested pmap
9668  * for a top-level user pmap.
9669  *
9670  * @param grand The top-level user pmap
9671  * @param subord The pmap to be set as [grand]'s nested pmap
9672  * @param vstart The base VA of the region to be nested.
9673  * @param size The size (in bytes) of the region to be nested.
9674  */
9675 MARK_AS_PMAP_TEXT kern_return_t
9676 pmap_set_shared_region_internal(
9677 	pmap_t grand,
9678 	pmap_t subord,
9679 	addr64_t vstart,
9680 	uint64_t size)
9681 {
9682 	addr64_t        vend;
9683 	uint64_t        nested_region_unnested_table_bitmap_size;
9684 	unsigned int*   nested_region_unnested_table_bitmap = NULL;
9685 	kern_return_t   kr = KERN_SUCCESS;
9686 
9687 	validate_pmap_mutable(grand);
9688 	validate_pmap(subord);
9689 
9690 #if XNU_MONITOR
9691 	/*
9692 	 * Ordering is important here.  validate_pmap() has already ensured subord is a
9693 	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9694 	 * be in the process of being destroyed.  If destruction is already committed,
9695 	 * then the check of ref_count below will cover us.  If destruction is initiated
9696 	 * during or after this call, then pmap_destroy() will catch the non-zero
9697 	 * nested_count.
9698 	 */
9699 	os_atomic_inc(&subord->nested_count, relaxed);
9700 	os_atomic_thread_fence(seq_cst);
9701 #endif
9702 	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
9703 		panic("%s: invalid subordinate pmap %p", __func__, subord);
9704 	}
9705 
9706 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9707 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9708 	}
9709 
9710 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9711 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9712 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9713 	}
9714 	if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9715 		panic("%s: pmap %p unaligned set_shared_region request 0x%llx, 0x%llx",
9716 		    __func__, grand, vstart, size);
9717 	}
9718 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9719 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9720 	}
9721 
9722 	if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
9723 		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9724 
9725 		/**
9726 		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
9727 		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
9728 		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
9729 		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
9730 		 */
9731 		nested_region_unnested_table_bitmap_size <<= 1;
9732 
9733 		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9734 			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9735 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9736 			    __func__, nested_region_unnested_table_bitmap_size,
9737 			    grand, subord, vstart, size);
9738 		}
9739 
9740 #if XNU_MONITOR
9741 		pmap_paddr_t pa = 0;
9742 
9743 		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9744 			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9745 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9746 			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
9747 			    grand, subord, vstart, size);
9748 		}
9749 
9750 		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9751 
9752 		if (kr != KERN_SUCCESS) {
9753 			goto done;
9754 		}
9755 
9756 		assert(pa);
9757 
9758 		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9759 #else
9760 		nested_region_unnested_table_bitmap = kalloc_data(
9761 			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9762 			Z_WAITOK | Z_ZERO);
9763 #endif
9764 
9765 		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9766 			kr = KERN_ABORTED;
9767 			goto done;
9768 		}
9769 
9770 		if (subord->nested_region_unnested_table_bitmap == NULL) {
9771 			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9772 			subord->nested_region_addr = vstart;
9773 			subord->nested_region_size = (mach_vm_offset_t) size;
9774 
9775 			/**
9776 			 * Use a store-release operation to ensure that the rest of the subord->nested_region_*
9777 			 * fields are initialized and visible before setting the nested_region_unnested_table_bitmap
9778 			 * field (which is used as the flag to say that the rest are initialized).
9779 			 */
9780 			os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
9781 			nested_region_unnested_table_bitmap = NULL;
9782 		}
9783 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9784 	}
9785 
9786 	if (__improbable(!os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst))) {
9787 		panic("%s: attempt to nest pmap %p into pmap %p which already has a nested pmap %p",
9788 		    __func__, subord, grand, grand->nested_pmap);
9789 	}
9790 	/**
9791 	 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9792 	 * into a nested pmap, which would then produce multiple levels of nesting.
9793 	 */
9794 	if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9795 		panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9796 	}
9797 
9798 done:
9799 	if (nested_region_unnested_table_bitmap != NULL) {
9800 #if XNU_MONITOR
9801 		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9802 #else
9803 		kfree_data(nested_region_unnested_table_bitmap,
9804 		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9805 #endif
9806 		nested_region_unnested_table_bitmap = NULL;
9807 	}
9808 
9809 	if (kr != KERN_SUCCESS) {
9810 #if XNU_MONITOR
9811 		os_atomic_dec(&subord->nested_count, relaxed);
9812 #endif
9813 		pmap_destroy_internal(subord);
9814 	}
9815 
9816 	return kr;
9817 }
9818 
/*
 * External entry point for associating a shared-region (nested) pmap with
 * a top-level user pmap.  On PPL builds, retries the PPL call, donating a
 * page on KERN_RESOURCE_SHORTAGE and re-attempting on KERN_ABORTED (lock
 * contention with pending preemption), until it succeeds.
 */
__mockable void
pmap_set_shared_region(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);

	pmap_verify_preemptible();
#if XNU_MONITOR
	do {
		kr = pmap_set_shared_region_ppl(grand, subord, vstart, size);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
		} else if ((kr != KERN_SUCCESS) && (kr != KERN_ABORTED)) {
			panic("%s: unexpected return code 0x%x from pmap_set_shared_region_ppl",
			    __func__, kr);
		}
	} while (kr != KERN_SUCCESS);

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_set_shared_region_internal()
	 * will wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	kr = pmap_set_shared_region_internal(grand, subord, vstart, size);
	assert3u(kr, ==, KERN_SUCCESS);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
}
9857 
9858 /**
9859  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9860  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9861  * This function operates in 3 main phases:
9862  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9863  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9864  *    the mapping range are present in subord.
9865  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9866  *    contains pointers to subord's leaf-level pagetable pages for the specified
9867  *    VA range.
9868  *
9869  * This function may return early due to pending AST_URGENT preemption; if so
9870  * it will indicate the need to be re-entered.
9871  *
9872  * @note This function requires that [subord] has already been associated with
9873  *       [grand] through a call to pmap_set_shared_region().
9874  *
9875  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9876  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9877  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9878  * @param size twig-aligned size of the nesting range
9879  * @param vrestart the twig-aligned starting address of the current call.  May contain
9880  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9881  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9882  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9883  *
9884  * @return the virtual address at which to restart the operation, possibly including
9885  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9886  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9887  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	int             expand_options = 0;
	bool            grand_locked = false;

	/* Reject a nesting range that wraps the address space. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (with the phase bit masked off) must lie within [vstart, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* grand's nested pmap must have been configured (to subord) before this call. */
	if (__improbable(subord != grand->nested_pmap)) {
		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}

#if XNU_MONITOR
	/* The PPL cannot block on page allocation; the caller retries on KERN_RESOURCE_SHORTAGE. */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Start address, size, and restart cursor must all be twig-table aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	/* Preemptible lock acquisition: bail out and let the caller re-enter. */
	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	/* The requested range must fall entirely within subord's nested region. */
	if (__improbable((subord->nested_region_addr + subord->nested_region_size) < vend) ||
	    (subord->nested_region_addr > vstart)) {
		panic("%s: attempt to nest [0x%llx, 0x%llx) in pmap %p outside nested pmap %p bounds [0x%llx, 0x%llx)\n",
		    __func__, vstart, vend, grand, subord, subord->nested_region_addr, subord->nested_region_addr + subord->nested_region_size);
	}
	if (grand->nested_region_size == 0) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		/**
		 * Ensure that we won't exceed the nested_region_unnested_table bitmap bounds established
		 * in pmap_set_shared_region_internal().
		 */
		if (__improbable((vstart < subord->nested_region_addr) ||
		    (vend > (subord->nested_region_addr + subord->nested_region_size)))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		/* Subsequent nesting: grow grand's recorded nested region if needed. */
		if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Clamp the working cursor to subord's "true" (trimmed) bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* Phase bit set: subord expansion already finished on a prior call; skip to copying. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 1: ensure subord has leaf page tables for the whole range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* Drop the lock to expand; pmap_expand may allocate. */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/* Periodically yield if urgent preemption is pending; caller re-enters at vrestart. */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	/* Phase 1 complete: reset the cursor and flag phase 2 in the restart cursor. */
	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 2: install subord's twig TTEs into grand. */
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Don't clobber a pmap_expand failure code with KERN_ABORTED. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate.  In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached the (possibly trimmed) end: report completion as vend | PMAP_NEST_GRAND. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Make the new TTEs visible to the page-table walker before returning. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	/* krp is a kernel pointer; pin it while the PPL writes through it. */
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	return vrestart;
}
10115 
/**
 * Kernel-facing entry point for nesting subord into grand at [vstart, vstart + size).
 * Repeatedly invokes the internal (or PPL) helper, which may return early for
 * preemption or resource shortage, until the returned restart cursor equals
 * (vend | PMAP_NEST_GRAND), indicating completion.
 */
__mockable kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor, or we'd spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10174 
10175 /*
10176  *	kern_return_t pmap_unnest(grand, vaddr)
10177  *
10178  *	grand  = the pmap that will have the virtual range unnested
10179  *	vaddr  = start of range in pmap to be unnested
10180  *	size   = size of range in pmap to be unnested
10181  *
10182  */
10183 
kern_return_t
pmap_unnest(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size)
{
	/* Convenience wrapper: unnest with no option flags set. */
	return pmap_unnest_options(grand, vaddr, size, 0);
}
10192 
10193 /**
10194  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10195  * from a top-level pmap ('grand').  The corresponding mappings in the nested
10196  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10197  * still have the region nested.  The mappings in 'grand' will be left empty
10198  * with the assumption that they will be demand-filled by subsequent access faults.
10199  *
10200  * This function operates in 2 main phases:
10201  * 1. Iteration over the nested pmap's mappings for the specified range to mark
10202  *    them non-global.
10203  * 2. Clearing of the twig-level TTEs for the address range in grand.
10204  *
10205  * This function may return early due to pending AST_URGENT preemption; if so
10206  * it will indicate the need to be re-entered.
10207  *
10208  * @param grand pmap from which to unnest mappings
10209  * @param vaddr twig-aligned virtual address for the beginning of the nested range
10210  * @param size twig-aligned size of the nested range
10211  * @param vrestart the page-aligned starting address of the current call.  May contain
10212  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10213  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10214  *        grand is being torn down and step 1) above is not needed.
10215  *
10216  * @return the virtual address at which to restart the operation, possibly including
10217  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
10218  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10219  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t     *tte_p;
	unsigned int    current_index;
	unsigned int    start_index;
	unsigned int    max_index;
	unsigned int    entry_count = 0;

	/* Reject a range that wraps the address space. */
	addr64_t vend;
	addr64_t true_end;
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The restart cursor (with the phase bit masked off) must lie within [vaddr, vend]. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	/* The range must be contained in grand's recorded nested region. */
	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* Base and size must be twig (TTE) aligned. */
	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the end of the walk to the nested pmap's "true" (trimmed) end. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/* Phase 1: mark the nested pmap's mappings non-global (skipped for teardown/restart). */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		/* Preemptible lock: if busy, ask the caller to re-enter at the same cursor. */
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		/* Walk one twig-sized region per iteration. */
		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t  *bpte, *cpte;

			/* vlim: the twig boundary that ends the region containing addr. */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs and the adjacent "in-progress" bit will be set.
			 */
			if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t    pa;
					unsigned int    pai = 0;
					boolean_t               managed = FALSE;
					pt_entry_t  spte;

					if (pte_is_valid(*cpte) && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						/*
						 * Take the PVH lock for the page before modifying its PTE;
						 * re-read the PTE after locking to close the race with a
						 * concurrent remap of this slot to a different page.
						 */
						spte = *((volatile pt_entry_t*)cpte);
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						/* Set the not-global bit; TLB flush needed only if something changed. */
						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					/* Yield mid-region if urgent preemption is pending; "in-progress" bit stays set. */
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
				/* Region fully processed; clear the in-progress marker. */
				clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		/* Didn't reach max_index: preempted, so return the restart cursor. */
		if (current_index < max_index) {
			return vrestart;
		}
	}

	/*
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straight into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	/* Phase 2: clear grand's twig TTEs for the range; faults will re-populate on demand. */
	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Reached the (possibly trimmed) end: report completion as vend | PMAP_NEST_GRAND. */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10430 
/**
 * Kernel-facing entry point for unnesting [vaddr, vaddr + size) from grand.
 * Loops on the internal (or PPL) helper until the returned restart cursor
 * equals (vend | PMAP_NEST_GRAND), indicating both unnest phases completed.
 */
__mockable kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	unsigned int option)
{
	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
	vm_map_offset_t vend = vaddr + size;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	pmap_verify_preemptible();
	while (vrestart != (vend | PMAP_NEST_GRAND)) {
#if XNU_MONITOR
		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
#else
		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
10457 
/* No parameter adjustment is needed on ARM; always report "proceed". */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10466 
10467 /**
10468  * Perform any necessary pre-nesting of the parent's shared region at fork()
10469  * time.
10470  *
10471  * @note This should only be called from vm_map_fork().
10472  *
10473  * @param old_pmap The pmap of the parent task.
10474  * @param new_pmap The pmap of the child task.
10475  *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10477  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10478  */
10479 kern_return_t
10480 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
10481 {
10482 	if (old_pmap == NULL || new_pmap == NULL) {
10483 		return KERN_INVALID_ARGUMENT;
10484 	}
10485 	if (old_pmap->nested_pmap == NULL) {
10486 		return KERN_SUCCESS;
10487 	}
10488 	/**
10489 	 * Obtain the full shared region bounds from the nested pmap.  If old_pmap
10490 	 * hasn't been fully nested yet, its bounds may not yet be configured.
10491 	 */
10492 	pmap_set_shared_region(new_pmap,
10493 	    old_pmap->nested_pmap,
10494 	    old_pmap->nested_pmap->nested_region_addr,
10495 	    old_pmap->nested_pmap->nested_region_size);
10496 	return KERN_SUCCESS;
10497 }
10498 
10499 /*
10500  * disable no-execute capability on
10501  * the specified pmap
10502  */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Allow all mappings in this pmap to be executable (debug builds only). */
	pmap->nx_enabled = FALSE;
}
#else
/* On release builds, disabling NX is not permitted; this is a no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10517 
10518 /*
10519  * flush a range of hardware TLB entries.
10520  * NOTE: assumes the smallest TLB entry in use will be for
10521  * an ARM small page (4K).
10522  */
10523 
10524 #if __ARM_RANGE_TLBI__
10525 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10526 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10527 #else
10528 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10529 #endif // __ARM_RANGE_TLBI__
10530 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10531     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10532     "of npages to 32 bits below may truncate.");
10533 
/*
 * Flush TLB entries for [va, va + length) in the given pmap without issuing
 * a completion barrier; caller must sync_tlb_flush() afterwards.  Chooses
 * between a full/ASID flush, a range flush (where supported), and per-entry
 * flushes based on the number of pages covered.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Too many pages to flush individually: flush the whole TLB or the whole ASID. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/* ASID 0 (kernel) and nested pmaps span ASIDs, so a full flush is required. */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-sized range: use a single range-based TLBI instruction. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: flush page-by-page, encoding the ASID into each operand. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10586 
/* Flush every TLB entry tagged with this pmap's ASID; no completion barrier. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10592 
/* Synchronously flush kernel-pmap TLB entries for [va, va + length). */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10601 
10602 unsigned int
10603 pmap_cache_attributes(
10604 	ppnum_t pn)
10605 {
10606 	pmap_paddr_t    paddr;
10607 	unsigned int    pai;
10608 	unsigned int    result;
10609 	pp_attr_t       pp_attr_current;
10610 
10611 	paddr = ptoa(pn);
10612 
10613 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10614 
10615 	if (!pa_valid(paddr)) {
10616 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10617 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10618 	}
10619 
10620 	result = VM_WIMG_DEFAULT;
10621 
10622 	pai = pa_index(paddr);
10623 
10624 	pp_attr_current = pp_attr_table[pai];
10625 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10626 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10627 	}
10628 	return result;
10629 }
10630 
/*
 * Perform any cache maintenance required when a page's WIMG attributes change
 * from wimg_bits_prev to wimg_bits_new (e.g. leaving a cacheable mode, or
 * entering real-time mode which requires a forced clean).
 *
 * NOTE(review): the subexpression
 * ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))
 * is tautologically true (a value can't equal both), so any transition away
 * from VM_WIMG_WTHRU triggers a sync.  Possibly '&&' was intended; the current
 * form merely over-syncs, which is conservative — confirm against upstream intent.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Entering real-time mode: force the data cache clean for the page. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10647 
/*
 * Update the cache attributes of a managed page used by the compressor,
 * then perform any cache/TLB maintenance implied by the attribute change.
 * Panics on unmanaged (and, under the PPL, PPL-owned) pages.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	/* The PVH lock protects the page's mappings while attributes change. */
	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance must happen after the mapping attributes are updated. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10672 
/*
 * Return a kernel virtual address for a page the compressor will access.
 * Where the physical aperture is mapped with per-page attributes, temporarily
 * switch the page to VM_WIMG_DEFAULT so compressor accesses are cacheable;
 * pmap_unmap_compressor_page() restores the original attribute.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	/* The page is always reachable through the physical aperture. */
	return (void*)phystokv(ptoa(pn));
}
10688 
/*
 * Undo pmap_map_compressor_page(): restore the page's original cache
 * attribute if it was temporarily switched to VM_WIMG_DEFAULT.
 *
 * NOTE(review): this re-reads the page's current attributes; it relies on
 * pmap_map_compressor_page() having left them unchanged (it only changes
 * the attribute used for the mapping, not the recorded value) — confirm
 * the attribute round-trips correctly across map/unmap.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10703 
10704 /**
10705  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10706  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10707  *
10708  * @param page_list List of pages to be updated.
10709  * @param cacheattr The new cache attribute.
10710  */
void
pmap_batch_set_cache_attributes(
	const unified_page_list_t *page_list,
	unsigned int cacheattr)
{
	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);

	if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
		/**
		 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
		 * In an ideal world we would just use these iterator functions within
		 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
		 * that means we'll need to take special care to handle pending preemption and
		 * if necessary return the iterator position out to this function and then re-enter
		 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
		 * secure manner.  Not impossible, but also not trivial, so unless someone asks for
		 * this perf improvement on the PPL I'm going to take the lazy approach here.
		 */
		unified_page_list_iterator_t iter;

		/* Non-UPL-array lists: fall back to one attribute update per page. */
		for (unified_page_list_iterator_init(page_list, &iter);
		    !unified_page_list_iterator_end(&iter);
		    unified_page_list_iterator_next(&iter)) {
			bool is_fictitious = false;
			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
			/* Fictitious pages have no backing frame to update. */
			if (__probable(!is_fictitious)) {
#if XNU_MONITOR
				pmap_set_cache_attributes_ppl(pn, cacheattr);
#else /* !XNU_MONITOR */
				pmap_set_cache_attributes_internal(pn, cacheattr);
#endif /* XNU_MONITOR */
			}
		}
		return;
	}

	if (page_list->upl.upl_size == 0) {
		return;
	}

	/* Batch path: drive the internal/PPL state machine to completion. */
	batch_set_cache_attr_state_t states;
	states.page_index = 0;
	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
	states.tlb_flush_pass_needed = false;
	states.rt_cache_flush_pass_needed = false;

	/* Verify we are being called from a preemptible context. */
	pmap_verify_preemptible();

	/* Each call may return early (e.g. for preemption); loop until DONE. */
	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
#if XNU_MONITOR
		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#else /* !XNU_MONITOR */
		states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#endif /* XNU_MONITOR */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
}
10772 
10773 /**
10774  * Flushes TLB entries associated with the page specified by paddr, but do not
10775  * issue barriers yet.
10776  *
10777  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10778  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* Caller must hold the PVH lock so the PV list can't change under us. */
	pvh_assert_locked(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page; each PV entry can hold PTE_PER_PVE slots. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU TLB entries to flush. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		/* Flush the single-page mapping in its owning pmap. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PV entry once all of this entry's slots are consumed. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10832 
10833 /**
10834  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10835  *
10836  * @param pai The Physical Address Index of the entry.
10837  * @param cacheattr The new cache attribute.
10838  */
10839 MARK_AS_PMAP_TEXT static void
10840 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10841 {
10842 	pvh_assert_locked(pai);
10843 
10844 	pp_attr_t pp_attr_current, pp_attr_template;
10845 	do {
10846 		pp_attr_current = pp_attr_table[pai];
10847 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10848 
10849 		/**
10850 		 * WIMG bits should only be updated under the PVH lock, but we should do
10851 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10852 		 */
10853 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10854 }
10855 
10856 /**
10857  * Batch updates the cache attributes of a list of pages in three passes.
10858  *
10859  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10860  * In pass two, TLB entries are flushed for each page in the list if necessary.
10861  * In pass three, caches are cleaned for each page in the list if necessary.
10862  *
10863  * When running in PPL, this function may decide to return to the caller in response
10864  * to AST_URGENT.
10865  *
10866  * @param user_page_list List of pages to be updated.
10867  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10868  * @param page_cnt Number of pages in total in user_page_list.
10869  * @param cacheattr The new cache attributes.
10870  *
10871  * @return The new state of the state machine.
10872  */
10873 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10874 pmap_batch_set_cache_attributes_internal(
10875 #if XNU_MONITOR
10876 	volatile upl_page_info_t *user_page_list,
10877 #else /* !XNU_MONITOR */
10878 	upl_page_info_array_t user_page_list,
10879 #endif /* XNU_MONITOR */
10880 	batch_set_cache_attr_state_t states,
10881 	unsigned int page_cnt,
10882 	unsigned int cacheattr)
10883 {
10884 	uint64_t page_index = states.page_index;
10885 	uint64_t state = states.state;
10886 	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10887 	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10888 
10889 	/* For verifying progress. */
10890 	__assert_only const uint64_t page_index_old = page_index;
10891 	__assert_only const uint64_t state_old = state;
10892 
10893 	/* Assert page_index and state are within their range. */
10894 	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10895 		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10896 	}
10897 
10898 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10899 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10900 		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
10901 		while (page_index < page_cnt) {
10902 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10903 			const pmap_paddr_t paddr = ptoa(pn);
10904 
10905 			if (!pa_valid(paddr)) {
10906 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10907 			}
10908 
10909 			const unsigned int pai = pa_index(paddr);
10910 
10911 			/* Lock the page. */
10912 			pvh_lock(pai);
10913 
10914 #if XNU_MONITOR
10915 			if (ppattr_pa_test_monitor(paddr)) {
10916 				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10917 			}
10918 #endif /* XNU_MONITOR */
10919 			const pp_attr_t pp_attr_current = pp_attr_table[pai];
10920 
10921 			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10922 			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10923 				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10924 			}
10925 
10926 			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10927 
10928 			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10929 			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10930 				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10931 			}
10932 
10933 			/* Update the cache attributes in PTE and PP_ATTR table. */
10934 			if (wimg_bits_new != wimg_bits_prev) {
10935 				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10936 				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10937 			}
10938 
10939 			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10940 				rt_cache_flush_pass_needed = true;
10941 			}
10942 
10943 			pvh_unlock(pai);
10944 
10945 			page_index++;
10946 
10947 #if XNU_MONITOR
10948 			/**
10949 			 * Check for AST_URGENT every page, as the pve list search in cache
10950 			 * update can take non-constant time.
10951 			 */
10952 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10953 				goto pbscai_exit;
10954 			}
10955 #endif /* XNU_MONITOR */
10956 		}
10957 
10958 		/* page_index == page_cnt && !pmap_pending_preemption() */
10959 		if (tlb_flush_pass_needed) {
10960 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10961 		} else if (rt_cache_flush_pass_needed) {
10962 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10963 		} else {
10964 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10965 		}
10966 		page_index = 0;
10967 
10968 		/* Sync the PTE writes before potential TLB/Cache flushes. */
10969 		FLUSH_PTE_STRONG();
10970 
10971 #if XNU_MONITOR
10972 		if (__improbable(pmap_pending_preemption())) {
10973 			goto pbscai_exit;
10974 		}
10975 #endif /* XNU_MONITOR */
10976 	}
10977 
10978 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10979 		/**
10980 		 * Pass 2: for each physical page and for each mapping, we need to flush
10981 		 * the TLB for it.
10982 		 */
10983 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10984 		while (page_index < page_cnt) {
10985 			const ppnum_t pn = user_page_list[page_index].phys_addr;
10986 
10987 			const pmap_paddr_t paddr = ptoa(pn);
10988 			if (!pa_valid(paddr)) {
10989 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10990 			}
10991 
10992 			const unsigned int pai = pa_index(paddr);
10993 
10994 			pvh_lock(pai);
10995 			pmap_flush_tlb_for_paddr_locked_async(paddr);
10996 			pvh_unlock(pai);
10997 
10998 			page_index++;
10999 
11000 #if XNU_MONITOR
11001 			/**
11002 			 * Check for AST_URGENT every page, as the pve list search in cache
11003 			 * update can take non-constant time.
11004 			 */
11005 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
11006 				goto pbscai_exit;
11007 			}
11008 #endif /* XNU_MONITOR */
11009 		}
11010 
11011 #if HAS_FEAT_XS
11012 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11013 		arm64_sync_tlb(false);
11014 #else
11015 		/**
11016 		 * For targets that distinguish between mild and strong DSB, mild DSB
11017 		 * will not drain the prefetcher.  This can lead to prefetch-driven
11018 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
11019 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11020 		 */
11021 		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
11022 #endif
11023 
11024 		if (rt_cache_flush_pass_needed) {
11025 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
11026 		} else {
11027 			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
11028 		}
11029 		page_index = 0;
11030 
11031 #if XNU_MONITOR
11032 		if (__improbable(pmap_pending_preemption())) {
11033 			goto pbscai_exit;
11034 		}
11035 #endif /* XNU_MONITOR */
11036 	}
11037 
11038 	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
11039 		/* Pass 3: Flush the cache if the page is recently set to RT */
11040 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
11041 #if !XNU_MONITOR
11042 		/**
11043 		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
11044 		 * in the state where DC by VA instructions remain enabled.
11045 		 */
11046 		disable_preemption();
11047 #endif /* !XNU_MONITOR */
11048 
11049 		assert(get_preemption_level() > 0);
11050 
11051 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11052 		/**
11053 		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
11054 		 * and the host will handle cache maintenance for it. So we don't need to
11055 		 * worry about enabling the ops here for AVP.
11056 		 */
11057 		enable_dc_mva_ops();
11058 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11059 
11060 		while (page_index < page_cnt) {
11061 			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
11062 
11063 			if (!pa_valid(paddr)) {
11064 				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
11065 			}
11066 
11067 			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
11068 
11069 			page_index++;
11070 
11071 #if XNU_MONITOR
11072 			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
11073 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11074 				disable_dc_mva_ops();
11075 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11076 				goto pbscai_exit;
11077 			}
11078 #endif /* XNU_MONITOR */
11079 		}
11080 
11081 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
11082 		disable_dc_mva_ops();
11083 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
11084 
11085 #if !XNU_MONITOR
11086 		enable_preemption();
11087 #endif /* !XNU_MONITOR */
11088 
11089 		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
11090 		page_index = 0;
11091 	}
11092 
11093 #if XNU_MONITOR
11094 pbscai_exit:
11095 #endif /* XNU_MONITOR */
11096 	/* Assert page_index and state are within their range. */
11097 	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
11098 
11099 	/* Make sure we are making progress in this call. */
11100 	assert(page_index > page_index_old || state > state_old);
11101 
11102 	batch_set_cache_attr_state_t states_new;
11103 	states_new.page_index = page_index;
11104 	states_new.state = state;
11105 	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
11106 	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
11107 	return states_new;
11108 }
11109 
/**
 * Common implementation for updating the cache attributes of a single managed page.
 *
 * Atomically rewrites the page's WIMG bits in the pp_attr_table under the PVH
 * lock, rewrites all PTEs mapping the page (with an immediate TLB flush) when
 * the effective WIMG value changed, and finally performs any required
 * cache synchronization via pmap_sync_wimg().
 *
 * @param pn The page number of the page to update.
 * @param cacheattr The new cache attributes.
 * @param external TRUE when invoked on behalf of the kernel proper; on PPL
 *        systems an external call must not target a PPL-owned page, and an
 *        internal call must target one (enforced by panics below).
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* A zero WIMG field in pp_attr means the page uses the default attributes. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the PTEs (and flush the TLB) if the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance implied by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11172 
11173 MARK_AS_PMAP_TEXT void
11174 pmap_set_cache_attributes_internal(
11175 	ppnum_t pn,
11176 	unsigned int cacheattr)
11177 {
11178 	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
11179 }
11180 
11181 void
11182 pmap_set_cache_attributes(
11183 	ppnum_t pn,
11184 	unsigned int cacheattr)
11185 {
11186 #if XNU_MONITOR
11187 	pmap_set_cache_attributes_ppl(pn, cacheattr);
11188 #else
11189 	pmap_set_cache_attributes_internal(pn, cacheattr);
11190 #endif
11191 }
11192 
11193 /**
11194  * Updates the page numbered ppnum to have attribute specified by attributes.
11195  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11196  * The necessity of the TLB flush is returned in case this function is called
11197  * in a batched manner and the TLB flush is intended to be done at a different
11198  * timing.
11199  *
11200  * @param ppnum Page Number of the page to be updated.
11201  * @param attributes The new cache attributes.
11202  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11203  *        immediately.
11204  *
11205  * @return Returns true if a TLB flush is needed for this update regardless of
11206  *         whether a flush has occurred already.
11207  */
11208 MARK_AS_PMAP_TEXT bool
11209 pmap_update_cache_attributes_locked(
11210 	ppnum_t ppnum,
11211 	unsigned attributes,
11212 	bool perform_tlbi)
11213 {
11214 	pmap_paddr_t    phys = ptoa(ppnum);
11215 	pv_entry_t      *pve_p;
11216 	pt_entry_t      *pte_p;
11217 	pv_entry_t      **pv_h;
11218 	pt_entry_t      tmplate;
11219 	unsigned int    pai;
11220 	boolean_t       tlb_flush_needed = false;
11221 
11222 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
11223 
11224 	if (pmap_panic_dev_wimg_on_managed) {
11225 		switch (attributes & VM_WIMG_MASK) {
11226 		case VM_WIMG_IO:                        // nGnRnE
11227 		case VM_WIMG_POSTED:                    // nGnRE
11228 		/* supported on DRAM, but slow, so we disallow */
11229 
11230 		case VM_WIMG_POSTED_REORDERED:          // nGRE
11231 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
11232 			/* unsupported on DRAM */
11233 
11234 			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
11235 			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
11236 			break;
11237 
11238 		default:
11239 			/* not device type memory, all good */
11240 
11241 			break;
11242 		}
11243 	}
11244 
11245 #if __ARM_PTE_PHYSMAP__
11246 	vm_offset_t kva = phystokv(phys);
11247 	pte_p = pmap_pte(kernel_pmap, kva);
11248 
11249 	tmplate = *pte_p;
11250 	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11251 #if XNU_MONITOR
11252 	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
11253 #else
11254 	tmplate |= wimg_to_pte(attributes, phys);
11255 #endif
11256 	if (tmplate & ARM_PTE_HINT_MASK) {
11257 		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
11258 		    __FUNCTION__, pte_p, (void *)kva, tmplate);
11259 	}
11260 
11261 	if (perform_tlbi) {
11262 		write_pte_strong(pte_p, tmplate);
11263 		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
11264 	} else {
11265 		write_pte_fast(pte_p, tmplate);
11266 	}
11267 	tlb_flush_needed = true;
11268 #endif
11269 
11270 	pai = pa_index(phys);
11271 
11272 	pv_h = pai_to_pvh(pai);
11273 
11274 	pte_p = PT_ENTRY_NULL;
11275 	pve_p = PV_ENTRY_NULL;
11276 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11277 		pte_p = pvh_ptep(pv_h);
11278 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11279 		pve_p = pvh_pve_list(pv_h);
11280 		pte_p = PT_ENTRY_NULL;
11281 	}
11282 
11283 	int pve_ptep_idx = 0;
11284 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11285 		vm_map_address_t va;
11286 		pmap_t          pmap;
11287 
11288 		if (pve_p != PV_ENTRY_NULL) {
11289 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11290 			if (pte_p == PT_ENTRY_NULL) {
11291 				goto cache_skip_pve;
11292 			}
11293 		}
11294 
11295 #ifdef PVH_FLAG_IOMMU
11296 		if (pvh_ptep_is_iommu(pte_p)) {
11297 			goto cache_skip_pve;
11298 		}
11299 #endif
11300 		pmap = ptep_get_pmap(pte_p);
11301 #if HAS_FEAT_XS
11302 		/**
11303 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11304 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11305 		 */
11306 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11307 #endif /* HAS_FEAT_XS */
11308 		va = ptep_get_va(pte_p);
11309 
11310 		tmplate = *pte_p;
11311 		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11312 		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11313 
11314 		if (perform_tlbi) {
11315 			write_pte_strong(pte_p, tmplate);
11316 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11317 			    pmap, true, false);
11318 		} else {
11319 			write_pte_fast(pte_p, tmplate);
11320 		}
11321 		tlb_flush_needed = true;
11322 
11323 cache_skip_pve:
11324 		pte_p = PT_ENTRY_NULL;
11325 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11326 			pve_ptep_idx = 0;
11327 			pve_p = pve_next(pve_p);
11328 		}
11329 	}
11330 	if (perform_tlbi && tlb_flush_needed) {
11331 #if HAS_FEAT_XS
11332 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11333 		arm64_sync_tlb(false);
11334 #else
11335 		/**
11336 		 * For targets that distinguish between mild and strong DSB, mild DSB
11337 		 * will not drain the prefetcher.  This can lead to prefetch-driven
11338 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
11339 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11340 		 */
11341 		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11342 #endif
11343 	}
11344 
11345 	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11346 
11347 	return tlb_flush_needed;
11348 }
11349 
11350 /**
11351  * Mark a pmap as being dedicated to use for a commpage mapping.
11352  * The pmap itself will never be activated on a CPU; its mappings will
11353  * only be embedded in userspace pmaps at a fixed virtual address.
11354  *
11355  * @param pmap the pmap to mark as belonging to a commpage.
11356  */
11357 static void
11358 pmap_set_commpage(pmap_t pmap)
11359 {
11360 #if XNU_MONITOR
11361 	assert(!pmap_ppl_locked_down);
11362 #endif
11363 	assert(pmap->type == PMAP_TYPE_USER);
11364 	pmap->type = PMAP_TYPE_COMMPAGE;
11365 	/*
11366 	 * Free the pmap's ASID.  This pmap should not ever be directly
11367 	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
11368 	 * ASID space contention but will also cause pmap_switch() to panic
11369 	 * if an attacker tries to activate this pmap.  Disable preemption to
11370 	 * accommodate the *_nopreempt spinlock in free_asid().
11371 	 */
11372 	mp_disable_preemption();
11373 	pmap_get_pt_ops(pmap)->free_id(pmap);
11374 	mp_enable_preemption();
11375 }
11376 
11377 static void
11378 pmap_update_tt3e(
11379 	pmap_t pmap,
11380 	vm_address_t address,
11381 	tt_entry_t template)
11382 {
11383 	tt_entry_t *ptep, pte;
11384 
11385 	ptep = pmap_tt3e(pmap, address);
11386 	if (ptep == NULL) {
11387 		panic("%s: no ptep?", __FUNCTION__);
11388 	}
11389 
11390 	pte = *ptep;
11391 	pte = tte_to_pa(pte) | template;
11392 	write_pte_strong(ptep, pte);
11393 }
11394 
/*
 * PTE template for the commpage data mappings: read-only, never-execute.
 * Note absence of non-global bit: the commpage page tables are shared by all
 * user pmaps, so these entries must be global (not tagged with any ASID).
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text (PFZ) mapping.
 * Note absence of non-global bit and no-execute bit: user code must be able
 * to execute from this page.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11406 
/**
 * Allocate the commpage backing pages and build the dedicated commpage pmap(s)
 * whose translation tables are later nested into every user pmap.
 *
 * @param kernel_data_addr [out] KVA of the writable commpage data page.
 * @param kernel_text_addr [out] KVA of the commpage text page (0 when no text
 *        page is allocated, i.e. CONFIG_ARM_PFZ is disabled).
 * @param kernel_ro_data_addr [out] KVA of the kernel read-only commpage data
 *        page (aliases the data page on non-PPL systems).
 * @param user_text_addr [out] randomized user VA chosen for the commpage text
 *        page (0 when CONFIG_ARM_PFZ is disabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, allocate (and zero) each page from the PPL page free list. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the PTE as global/read-only per the commpage template. */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11557 
11558 
11559 /*
11560  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11561  * with user controlled TTEs for regions that aren't explicitly reserved by the
11562  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11563  */
11564 #if (ARM_PGSHIFT == 14)
11565 /**
11566  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11567  * commpage completely above the maximum 32-bit userspace VA.
11568  */
11569 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11570 
11571 /**
11572  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11573  * userspace VAs can nest the commpage completely above the maximum 64-bit
11574  * userpace VA, but that technically isn't true on macOS. On those systems, the
11575  * commpage lives within the userspace VA range, but is protected by the VM as
11576  * a reserved region (see vm_reserved_regions[] definition for more info).
11577  */
11578 
11579 #elif (ARM_PGSHIFT == 12)
11580 /**
11581  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11582  * above the maximum userspace VA.
11583  */
11584 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11585 #else
11586 #error Nested shared page mapping is unsupported on this config
11587 #endif
11588 
/**
 * Nest the pre-constructed commpage page tables into a user pmap so the
 * commpage becomes mapped without allocating dedicated leaf tables per task.
 *
 * @param pmap The user pmap to receive the shared commpage table entry.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (and, under XNU_MONITOR,
 *         KERN_RESOURCE_SHORTAGE) if pmap_expand() could not complete and the
 *         caller should retry (see pmap_insert_commpage()).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL, table allocation cannot block; let pmap_expand() fail instead. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() may allocate, so it must run outside the pmap lock. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				/* Caller will feed the PPL a fresh page and retry. */
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Share the commpage pmap's subtree by copying its twig entry into this pmap. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11691 
/**
 * Remove the nested commpage twig entry from a user pmap and flush any stale
 * TLB entries covering the commpage address.
 *
 * @param pmap The pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never set up for this pmap. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate TLB entries for the commpage VA in this pmap's ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11748 
11749 void
11750 pmap_insert_commpage(
11751 	pmap_t pmap)
11752 {
11753 	kern_return_t kr = KERN_FAILURE;
11754 #if XNU_MONITOR
11755 	do {
11756 		kr = pmap_insert_commpage_ppl(pmap);
11757 
11758 		if (kr == KERN_RESOURCE_SHORTAGE) {
11759 			pmap_alloc_page_for_ppl(0);
11760 		}
11761 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11762 
11763 	pmap_ledger_check_balance(pmap);
11764 #else
11765 	do {
11766 		kr = pmap_insert_commpage_internal(pmap);
11767 	} while (kr == KERN_ABORTED);
11768 #endif
11769 
11770 	if (kr != KERN_SUCCESS) {
11771 		panic("%s: failed to insert the shared page, kr=%d, "
11772 		    "pmap=%p",
11773 		    __FUNCTION__, kr,
11774 		    pmap);
11775 	}
11776 }
11777 
/* Return whether this pmap manages a 64-bit (vs. 32-bit) user address space. */
static boolean_t
pmap_is_64bit(
	pmap_t pmap)
{
	return pmap->is_64bit;
}
11784 
/*
 * Report whether the pmap uses an "exotic" (non-standard) configuration.
 * Always false in this configuration.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11791 
11792 
11793 /* ARMTODO -- an implementation that accounts for
11794  * holes in the physical map, if any.
11795  */
11796 boolean_t
11797 pmap_valid_page(
11798 	ppnum_t pn)
11799 {
11800 	return pa_valid(ptoa(pn));
11801 }
11802 
11803 boolean_t
11804 pmap_bootloader_page(
11805 	ppnum_t pn)
11806 {
11807 	pmap_paddr_t paddr = ptoa(pn);
11808 
11809 	if (pa_valid(paddr)) {
11810 		return FALSE;
11811 	}
11812 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11813 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11814 }
11815 
/**
 * Walk the page tables of [va_start, va_end) and report whether the range
 * contains no valid leaf mappings.
 *
 * @param pmap The pmap to inspect (NULL is treated as trivially empty).
 * @param va_start Start of the VA range to check.
 * @param va_end End of the VA range to check.
 *
 * @return TRUE if no PTE in the range is populated, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot not_in_kdp so lock and unlock decisions agree even if it changes. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Only take the lock outside the kernel debugger context. */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Iterate one twig-table (block) at a time, scanning the leaf PTEs in each. */
	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a live mapping: drop the lock (if held) and bail. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11875 
/*
 * Return TRUE if [va_start, va_end) contains no mappings in the given pmap.
 * Dispatches into the PPL on monitor-enabled systems.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11888 
11889 vm_map_offset_t
11890 pmap_max_offset(
11891 	boolean_t               is64,
11892 	unsigned int    option)
11893 {
11894 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11895 }
11896 
/**
 * Compute the maximum userspace VA for a 64-bit address space.
 *
 * @param option One of the ARM_PMAP_MAX_OFFSET_* selectors (DEFAULT, MIN, MAX,
 *               DEVICE, JUMBO, and optionally EXTRA_JUMBO).
 *
 * @return The chosen maximum offset; panics on an unknown option or when
 *         invoked on a non-64-bit architecture.
 */
vm_map_offset_t
pmap_max_64bit_offset(
	__unused unsigned int option)
{
	vm_map_offset_t max_offset_ret = 0;

#if defined(__arm64__)
	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
		/* arm64_pmap_max_offset_default is the boot-arg override (0 if unset). */
		max_offset_ret = arm64_pmap_max_offset_default;
	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
		max_offset_ret = min_max_offset;
	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
		if (arm64_pmap_max_offset_default) {
			max_offset_ret = arm64_pmap_max_offset_default;
		} else if (max_mem > 0xC0000000) {
			// devices with > 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
		} else if (max_mem > 0x40000000) {
			// devices with > 1GB and <= 3GB of memory
			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
		} else {
			// devices with <= 1 GB of memory
			max_offset_ret = min_max_offset;
		}
	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
		if (arm64_pmap_max_offset_default) {
			// Allow the boot-arg to override jumbo size
			max_offset_ret = arm64_pmap_max_offset_default;
		} else {
			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
		}
#if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
		max_offset_ret = MACH_VM_MAX_ADDRESS;
#endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
	} else {
		panic("pmap_max_64bit_offset illegal option 0x%x", option);
	}

	/* DEFAULT may legitimately return 0 (no boot-arg); all others have a floor. */
	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
		assert(max_offset_ret >= min_max_offset);
	}
#else
	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
#endif

	return max_offset_ret;
}
11949 
11950 vm_map_offset_t
11951 pmap_max_32bit_offset(
11952 	unsigned int option)
11953 {
11954 	vm_map_offset_t max_offset_ret = 0;
11955 
11956 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11957 		max_offset_ret = arm_pmap_max_offset_default;
11958 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11959 		max_offset_ret = VM_MAX_ADDRESS;
11960 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11961 		max_offset_ret = VM_MAX_ADDRESS;
11962 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11963 		if (arm_pmap_max_offset_default) {
11964 			max_offset_ret = arm_pmap_max_offset_default;
11965 		} else if (max_mem > 0x20000000) {
11966 			max_offset_ret = VM_MAX_ADDRESS;
11967 		} else {
11968 			max_offset_ret = VM_MAX_ADDRESS;
11969 		}
11970 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11971 		max_offset_ret = VM_MAX_ADDRESS;
11972 	} else {
11973 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11974 	}
11975 
11976 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11977 	return max_offset_ret;
11978 }
11979 
11980 #if CONFIG_DTRACE
11981 /*
11982  * Constrain DTrace copyin/copyout actions
11983  */
11984 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11985 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11986 
11987 kern_return_t
11988 dtrace_copyio_preflight(
11989 	__unused addr64_t va)
11990 {
11991 	if (current_map() == kernel_map) {
11992 		return KERN_FAILURE;
11993 	} else {
11994 		return KERN_SUCCESS;
11995 	}
11996 }
11997 
/* No post-copyio cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
12004 #endif /* CONFIG_DTRACE */
12005 
12006 
/* No deferred-flush state is tracked on ARM, so there is nothing to initialize. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
12011 
12012 
12013 void
12014 pmap_flush(
12015 	__unused pmap_flush_context *cpus_to_flush)
12016 {
12017 	/* not implemented yet */
12018 	return;
12019 }
12020 
12021 #if XNU_MONITOR
12022 
12023 /*
12024  * Enforce that the address range described by kva and nbytes is not currently
12025  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
12026  * unintentionally writing to PPL-owned memory.
12027  */
12028 void
12029 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
12030 {
12031 	vm_offset_t end;
12032 	if (os_add_overflow(kva, nbytes, &end)) {
12033 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12034 	}
12035 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12036 		pmap_paddr_t pa = kvtophys_nofail(ckva);
12037 		unsigned int pai = pa_index(pa);
12038 		pp_attr_t attr;
12039 		if (__improbable(!pa_valid(pa))) {
12040 			panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12041 		}
12042 		pvh_lock(pai);
12043 		if (__improbable(ckva == phystokv(pa))) {
12044 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12045 		}
12046 		do {
12047 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12048 			if (__improbable(attr & PP_ATTR_MONITOR)) {
12049 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12050 			}
12051 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12052 		pvh_unlock(pai);
12053 		if (__improbable(kvtophys_nofail(ckva) != pa)) {
12054 			panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12055 		}
12056 	}
12057 }
12058 
/*
 * Release the pin taken by pmap_pin_kernel_pages() on every page in the range,
 * clearing PP_ATTR_NO_MONITOR so the PPL may claim the pages again.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		/* NOTE(review): pp_attr_table is read here without the PVH lock --
		 * presumably safe because only the pinning owner clears NO_MONITOR;
		 * confirm against pmap_pin_kernel_pages(). */
		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
12076 
12077 /**
12078  * Lock down a page, making all mappings read-only, and preventing further
12079  * mappings or removal of this particular kva's mapping. Effectively, it makes
12080  * the physical page at kva immutable (see the ppl_writable parameter for an
12081  * exception to this).
12082  *
12083  * @param kva Valid address to any mapping of the physical page to lockdown.
12084  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12085  * @param ppl_writable True if the PPL should still be able to write to the page
12086  *                     using the physical aperture mapping. False will make the
12087  *                     page read-only for both the kernel and PPL in the
12088  *                     physical aperture.
12089  */
12090 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown: demote all existing alias mappings to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12096 
12097 /**
12098  * Lock down a page, giving all mappings the specified maximum permissions, and
12099  * preventing further mappings or removal of this particular kva's mapping.
12100  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12101  * parameter for an exception to this).
12102  *
12103  * @param kva Valid address to any mapping of the physical page to lockdown.
12104  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12105  * @param ppl_writable True if the PPL should still be able to write to the page
12106  *                     using the physical aperture mapping. False will make the
12107  *                     page read-only for both the kernel and PPL in the
12108  *                     physical aperture.
12109  * @param prot Maximum permissions to allow in existing alias mappings
12110  */
12111 MARK_AS_PMAP_TEXT static void
12112 pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
12113 {
12114 	const pmap_paddr_t pa = kvtophys_nofail(kva);
12115 	const unsigned int pai = pa_index(pa);
12116 
12117 	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12118 	pvh_lock(pai);
12119 	pv_entry_t **pvh = pai_to_pvh(pai);
12120 	const vm_offset_t pvh_flags = pvh_get_flags(pvh);
12121 
12122 	if (__improbable(ppattr_pa_test_monitor(pa))) {
12123 		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
12124 	}
12125 
12126 	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
12127 		panic("%s: %#lx already locked down/executable (%#llx)",
12128 		    __func__, kva, (uint64_t)pvh_flags);
12129 	}
12130 
12131 
12132 	pvh_set_flags(pvh, pvh_flags | lockdown_flag);
12133 
12134 	/* Update the physical aperture mapping to prevent kernel write access. */
12135 	const unsigned int new_xprr_perm =
12136 	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
12137 	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
12138 
12139 	pvh_unlock(pai);
12140 
12141 	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);
12142 
12143 	/**
12144 	 * Double-check that the mapping didn't change physical addresses before the
12145 	 * LOCKDOWN flag was set (there is a brief window between the above
12146 	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
12147 	 *
12148 	 * This doesn't solve the ABA problem, but this doesn't have to since once
12149 	 * the pvh_lock() is grabbed no new mappings can be created on this physical
12150 	 * page without the LOCKDOWN flag already set (so any future mappings can
12151 	 * only be RO, and no existing mappings can be removed).
12152 	 */
12153 	if (kvtophys_nofail(kva) != pa) {
12154 		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
12155 		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
12156 	}
12157 }
12158 
12159 /**
12160  * Helper for releasing a page from being locked down to the PPL, making it writable to the
12161  * kernel once again.
12162  *
12163  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12164  *       to unlockdown a page that was never locked down, will panic.
12165  *
12166  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
12167  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12168  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12169  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12170  *                     deviation will result in a panic.
12171  */
12172 MARK_AS_PMAP_TEXT static void
12173 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
12174 {
12175 	pvh_assert_locked(pai);
12176 	pv_entry_t **pvh = pai_to_pvh(pai);
12177 	const vm_offset_t pvh_flags = pvh_get_flags(pvh);
12178 
12179 	if (__improbable(!(pvh_flags & lockdown_flag))) {
12180 		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
12181 		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
12182 	}
12183 
12184 
12185 	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
12186 
12187 	/* Restore the pre-lockdown physical aperture mapping permissions. */
12188 	const unsigned int old_xprr_perm =
12189 	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
12190 	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
12191 }
12192 
12193 /**
12194  * Release a page from being locked down to the PPL, making it writable to the
12195  * kernel once again.
12196  *
12197  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12198  *       to unlockdown a page that was never locked down, will panic.
12199  *
12200  * @param kva Valid address to any mapping of the physical page to unlockdown.
12201  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12202  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12203  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12204  *                     deviation will result in a panic.
12205  */
12206 MARK_AS_PMAP_TEXT static void
12207 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
12208 {
12209 	const pmap_paddr_t pa = kvtophys_nofail(kva);
12210 	const unsigned int pai = pa_index(pa);
12211 
12212 	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12213 	pvh_lock(pai);
12214 	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
12215 	pvh_unlock(pai);
12216 }
12217 
12218 #else /* XNU_MONITOR */
12219 
/* Pinning is a no-op when the PPL (XNU_MONITOR) is not present. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12224 
/* Unpinning is a no-op when the PPL (XNU_MONITOR) is not present. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12229 
12230 #endif /* !XNU_MONITOR */
12231 
12232 
/* Lock down code-signing pages; without the PPL no lockdown-reason flag is tracked. */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12242 
/* Release code-signing pages from lockdown; mirrors pmap_cs_lockdown_pages(). */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12252 
12253 /**
12254  * Perform basic validation checks on the destination only and
12255  * corresponding offset/sizes prior to writing to a read only allocation.
12256  *
12257  * @note Should be called before writing to an allocation from the read
12258  * only allocator.
12259  *
12260  * @param zid The ID of the zone the allocation belongs to.
12261  * @param va VA of element being modified (destination).
12262  * @param offset Offset being written to, in the element.
12263  * @param new_data_size Size of modification.
12264  *
12265  */
12266 
12267 MARK_AS_PMAP_TEXT static void
12268 pmap_ro_zone_validate_element_dst(
12269 	zone_id_t           zid,
12270 	vm_offset_t         va,
12271 	vm_offset_t         offset,
12272 	vm_size_t           new_data_size)
12273 {
12274 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12275 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12276 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12277 	}
12278 
12279 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12280 
12281 	/* Check element is from correct zone and properly aligned */
12282 	zone_require_ro(zid, elem_size, (void*)va);
12283 
12284 	if (__improbable(new_data_size > (elem_size - offset))) {
12285 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
12286 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12287 	}
12288 	if (__improbable(offset >= elem_size)) {
12289 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
12290 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12291 	}
12292 }
12293 
12294 
12295 /**
12296  * Perform basic validation checks on the source, destination and
12297  * corresponding offset/sizes prior to writing to a read only allocation.
12298  *
12299  * @note Should be called before writing to an allocation from the read
12300  * only allocator.
12301  *
12302  * @param zid The ID of the zone the allocation belongs to.
12303  * @param va VA of element being modified (destination).
12304  * @param offset Offset being written to, in the element.
12305  * @param new_data Pointer to new data (source).
12306  * @param new_data_size Size of modification.
12307  *
12308  */
12309 
12310 MARK_AS_PMAP_TEXT static void
12311 pmap_ro_zone_validate_element(
12312 	zone_id_t           zid,
12313 	vm_offset_t         va,
12314 	vm_offset_t         offset,
12315 	const vm_offset_t   new_data,
12316 	vm_size_t           new_data_size)
12317 {
12318 	vm_offset_t sum = 0;
12319 
12320 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12321 		panic("%s: Integer addition overflow %p + %lu = %lu",
12322 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12323 	}
12324 
12325 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12326 }
12327 
12328 /**
12329  * Ensure that physical page is locked down before writing to it.
12330  *
12331  * @note Should be called before writing to an allocation from the read
12332  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12333  * ensure that it is called after the modification.
12334  *
12335  *
12336  * @param pa Physical address of the element being modified.
12337  * @param va Virtual address of element being modified.
12338  * @param size Size of the modification.
12339  *
12340  */
12341 
12342 MARK_AS_PMAP_TEXT static void
12343 pmap_ro_zone_lock_phy_page(
12344 	const pmap_paddr_t  pa,
12345 	vm_offset_t         va,
12346 	vm_size_t           size)
12347 {
12348 	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
12349 		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
12350 		    __func__, (unsigned long long)va, (unsigned long long)size);
12351 	}
12352 	const unsigned int pai = pa_index(pa);
12353 	pvh_lock(pai);
12354 
12355 	/* Ensure that the physical page is locked down */
12356 #if XNU_MONITOR
12357 	pv_entry_t **pvh = pai_to_pvh(pai);
12358 	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
12359 		panic("%s: Physical page not locked down %llx", __func__, pa);
12360 	}
12361 #endif /* XNU_MONITOR */
12362 }
12363 
12364 /**
12365  * Unlock physical page after writing to it.
12366  *
12367  * @note Should be called after writing to an allocation from the read
12368  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12369  * ensure that it has been called prior to the modification.
12370  *
12371  * @param pa Physical address of the element that was modified.
12372  * @param va Virtual address of element that was modified.
12373  * @param size Size of the modification.
12374  *
12375  */
12376 
12377 MARK_AS_PMAP_TEXT static void
12378 pmap_ro_zone_unlock_phy_page(
12379 	const pmap_paddr_t  pa,
12380 	vm_offset_t         va __unused,
12381 	vm_size_t           size __unused)
12382 {
12383 	const unsigned int pai = pa_index(pa);
12384 	pvh_unlock(pai);
12385 }
12386 
12387 /**
12388  * Function to copy kauth_cred from new_data to kv.
12389  * Function defined in "kern_prot.c"
12390  *
12391  * @note Will be removed upon completion of
12392  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12393  *
12394  * @param kv Address to copy new data to.
12395  * @param new_data Pointer to new data.
12396  *
12397  */
12398 
12399 extern void
12400 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12401 
12402 /**
12403  * Zalloc-specific memcpy that writes through the physical aperture
12404  * and ensures the element being modified is from a read-only zone.
12405  *
12406  * @note Designed to work only with the zone allocator's read-only submap.
12407  *
12408  * @param zid The ID of the zone to allocate from.
12409  * @param va VA of element to be modified.
12410  * @param offset Offset from element.
12411  * @param new_data Pointer to new data.
12412  * @param new_data_size	Size of modification.
12413  *
12414  */
12415 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* Route through the PPL when present; otherwise call the kernel variant. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12430 
12431 MARK_AS_PMAP_TEXT void
12432 pmap_ro_zone_memcpy_internal(
12433 	zone_id_t             zid,
12434 	vm_offset_t           va,
12435 	vm_offset_t           offset,
12436 	const vm_offset_t     new_data,
12437 	vm_size_t             new_data_size)
12438 {
12439 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12440 
12441 	if (!new_data || new_data_size == 0) {
12442 		return;
12443 	}
12444 
12445 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12446 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12447 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12448 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12449 }
12450 
12451 /**
12452  * Zalloc-specific function to atomically mutate fields of an element that
12453  * belongs to a read-only zone, via the physcial aperture.
12454  *
12455  * @note Designed to work only with the zone allocator's read-only submap.
12456  *
12457  * @param zid The ID of the zone the element belongs to.
12458  * @param va VA of element to be modified.
12459  * @param offset Offset in element.
12460  * @param op Atomic operation to perform.
12461  * @param value	Mutation value.
12462  *
12463  */
12464 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* Route through the PPL when present; otherwise call the kernel variant. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12479 
/*
 * Atomically mutate a field of a read-only-zone element through the physical
 * aperture, holding the backing page's PVH lock across the operation.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low nibble of op encodes the operand width in bytes, as consumed here
	 * for bounds validation and the page-crossing check (see zro_atomic_op_t). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12498 
12499 /**
12500  * bzero for allocations from read only zones, that writes through the
12501  * physical aperture.
12502  *
12503  * @note This is called by the zfree path of all allocations from read
12504  * only zones.
12505  *
12506  * @param zid The ID of the zone the allocation belongs to.
12507  * @param va VA of element to be zeroed.
12508  * @param offset Offset in the element.
12509  * @param size	Size of allocation.
12510  *
12511  */
12512 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
#if XNU_MONITOR
	/* PPL systems: the zeroing must be performed at PPL privilege. */
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	/* Non-PPL systems: call the implementation directly. */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12526 
/*
 * Implementation of pmap_ro_zone_bzero() (runs inside the PPL on monitor
 * systems).  Zeroes the element through the physical aperture, since the
 * element's own VA mapping is read-only.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Validate the target range against the RO zone before touching it. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	/* Hold the physical-page lock across the write. */
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12540 
12541 /**
12542  * Removes write access from the Physical Aperture.
12543  *
12544  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12545  * @note Designed to work only with the zone allocator's read-only submap.
12546  *
 * @param va VA of the page to remove write access from.
12548  *
12549  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down read-only so writes require PPL privilege. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: demote every mapping of the underlying physical page to RO. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12559 
12560 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12561 
/*
 * Count resident and compressed bytes for a page-aligned VA range that lies
 * entirely within a single twig (TTE) of [pmap].  Returns the number of
 * resident bytes, or PMAP_RESIDENT_INVALID if the pmap is NULL or no page
 * table exists for the range.  If [compressed_bytes_p] is non-NULL, the
 * number of compressed bytes found is ADDED to it (callers are expected to
 * initialize it).
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if (tte_is_valid_table(*tte_p)) {
		/* Walk the leaf PTEs covering [start, end) within this twig. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the kernel out-parameter while writing through it from the PPL. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12623 
12624 mach_vm_size_t
12625 pmap_query_resident(
12626 	pmap_t                  pmap,
12627 	vm_map_address_t        start,
12628 	vm_map_address_t        end,
12629 	mach_vm_size_t          *compressed_bytes_p)
12630 {
12631 	mach_vm_size_t          total_resident_bytes;
12632 	mach_vm_size_t          compressed_bytes;
12633 	vm_map_address_t        va;
12634 
12635 
12636 	if (pmap == PMAP_NULL) {
12637 		if (compressed_bytes_p) {
12638 			*compressed_bytes_p = 0;
12639 		}
12640 		return 0;
12641 	}
12642 
12643 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12644 
12645 	total_resident_bytes = 0;
12646 	compressed_bytes = 0;
12647 
12648 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12649 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12650 	    VM_KERNEL_ADDRHIDE(end));
12651 
12652 	va = start;
12653 	while (va < end) {
12654 		vm_map_address_t l;
12655 		mach_vm_size_t resident_bytes;
12656 
12657 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12658 
12659 		if (l > end) {
12660 			l = end;
12661 		}
12662 #if XNU_MONITOR
12663 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12664 #else
12665 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12666 #endif
12667 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12668 			break;
12669 		}
12670 
12671 		total_resident_bytes += resident_bytes;
12672 
12673 		va = l;
12674 	}
12675 
12676 	if (compressed_bytes_p) {
12677 		*compressed_bytes_p = compressed_bytes;
12678 	}
12679 
12680 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12681 	    total_resident_bytes);
12682 
12683 	return total_resident_bytes;
12684 }
12685 
12686 #if MACH_ASSERT
12687 static void
12688 pmap_check_ledgers(
12689 	pmap_t pmap)
12690 {
12691 	int     pid;
12692 	char    *procname;
12693 
12694 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12695 		/*
12696 		 * This pmap was not or is no longer fully associated
12697 		 * with a task (e.g. the old pmap after a fork()/exec() or
12698 		 * spawn()).  Its "ledger" still points at a task that is
12699 		 * now using a different (and active) address space, so
12700 		 * we can't check that all the pmap ledgers are balanced here.
12701 		 *
12702 		 * If the "pid" is set, that means that we went through
12703 		 * pmap_set_process() in task_terminate_internal(), so
12704 		 * this task's ledger should not have been re-used and
12705 		 * all the pmap ledgers should be back to 0.
12706 		 */
12707 		return;
12708 	}
12709 
12710 	pid = pmap->pmap_pid;
12711 	procname = pmap->pmap_procname;
12712 
12713 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12714 }
12715 #endif /* MACH_ASSERT */
12716 
/* Intentionally a no-op in this pmap implementation. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12721 
12722 /**
12723  * The minimum shared region nesting size is used by the VM to determine when to
12724  * break up large mappings to nested regions. The smallest size that these
12725  * mappings can be broken into is determined by what page table level those
12726  * regions are being nested in at and the size of the page tables.
12727  *
12728  * For instance, if a nested region is nesting at L2 for a process utilizing
12729  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12730  * block entry).
12731  *
12732  * @param pmap The target pmap to determine the block size based on whether it's
12733  *             using 16KB or 4KB page tables.
12734  */
12735 uint64_t
12736 pmap_shared_region_size_min(__unused pmap_t pmap)
12737 {
12738 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12739 
12740 	/**
12741 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12742 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12743 	 * point to shared L3 page tables in the shared region pmap.
12744 	 */
12745 	return pt_attr_twig_size(pt_attr);
12746 }
12747 
12748 boolean_t
12749 pmap_enforces_execute_only(
12750 	pmap_t pmap)
12751 {
12752 	return pmap != kernel_pmap;
12753 }
12754 
/*
 * Implementation of pmap_set_vm_map_cs_enforced(): record the VM map's
 * code-signing enforcement flag on the pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12763 
/* Set the pmap's record of whether its VM map enforces code signing. */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	/* PPL systems: the pmap may only be mutated at PPL privilege. */
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12775 
12776 extern int cs_process_enforcement_enable;
12777 bool
12778 pmap_get_vm_map_cs_enforced(
12779 	pmap_t pmap)
12780 {
12781 	if (cs_process_enforcement_enable) {
12782 		return true;
12783 	}
12784 	return pmap->pmap_vm_map_cs_enforced;
12785 }
12786 
/* No-op in this configuration: JIT entitlement state is not tracked here. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12793 
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	/* PPL systems: route through the PPL entry point. */
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12804 
/* JIT entitlement state is not tracked in this configuration. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12811 
/* No-op in this configuration: TPRO state is not tracked here. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12818 
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	/* PPL systems: route through the PPL entry point. */
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12829 
/* TPRO state is not tracked in this configuration. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12836 
12837 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12838 
/*
 * Implementation of pmap_query_page_info(): compute the disposition of the
 * page mapped at [va] in [pmap] (present / compressed / alt-accounted /
 * reusable / internal bits) and store it through [disp_p].
 *
 * Returns KERN_INVALID_ARGUMENT for a NULL or kernel pmap (with *disp_p set
 * to 0), otherwise KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE; it can change concurrently until the PVH lock is held. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry (if any) corresponding to this mapping. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Pin the kernel out-parameter while writing through it from the PPL. */
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12917 
/* Query the disposition of the page mapped at [va] in [pmap]; see the
 * _internal variant for the disposition bits reported through [disp_p]. */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12930 
12931 
12932 
12933 uint32_t
12934 pmap_user_va_bits(pmap_t pmap __unused)
12935 {
12936 #if __ARM_MIXED_PAGE_SIZE__
12937 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
12938 	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
12939 #else
12940 	return 64 - T0SZ_BOOT;
12941 #endif
12942 }
12943 
/* Number of virtual-address bits available to the kernel, fixed at boot by
 * T1SZ_BOOT. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12949 
/* Size in bytes of this pmap's user address space: 2^pmap_user_va_bits(). */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12955 
12956 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
/*
 * Canonicalize a user (TTBR0) address for this pmap by masking off the bits
 * above the pmap's maximum address.  Addresses that select TTBR1 are
 * returned unchanged.
 */
static vm_map_address_t
pmap_strip_user_addr(pmap_t pmap, vm_map_address_t ptr)
{
	assert(pmap && pmap != kernel_pmap);

	/*
	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR0 address.
	 * Ignore the strip request.
	 */
	if ((ptr & TTBR_SELECTOR) != 0) {
		return ptr;
	}

	/* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
	return ptr & (pmap->max - 1);
}
12973 
/*
 * Canonicalize a kernel (TTBR1) address by OR-ing in the kernel pmap's
 * minimum address.  Addresses that select TTBR0 are returned unchanged.
 */
static vm_map_address_t
pmap_strip_kernel_addr(pmap_t pmap, vm_map_address_t ptr)
{
	assert(pmap && pmap == kernel_pmap);

	/*
	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR1 address.
	 * Ignore the strip request.
	 */
	if ((ptr & TTBR_SELECTOR) == 0) {
		return ptr;
	}

	/* This will reset the TTBR_SELECTOR, but we've confirmed above the value. */
	return ptr | pmap->min;
}
12990 
12991 vm_map_address_t
12992 pmap_strip_addr(pmap_t pmap, vm_map_address_t ptr)
12993 {
12994 	assert(pmap);
12995 
12996 	return pmap == kernel_pmap ? pmap_strip_kernel_addr(pmap, ptr) :
12997 	       pmap_strip_user_addr(pmap, ptr);
12998 }
12999 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
13000 
13001 
13002 
bool
pmap_in_ppl(void)
{
	/* No PPL exists in this configuration, so we can never be inside it. */
	return false;
}
13009 
/* Unsupported on this platform: unconditionally panics. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
13016 
void *
pmap_claim_reserved_ppl_page(void)
{
	/* No reserved PPL pages exist in this configuration. */
	return NULL;
}
13023 
/* No-op in this configuration: there are no reserved PPL pages to free. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
13029 
13030 
13031 #if PMAP_CS_PPL_MONITOR
13032 
13033 /* Immutable part of the trust cache runtime */
13034 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
13035 
13036 /* Mutable part of the trust cache runtime */
13037 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
13038 
13039 /* Lock for the trust cache runtime */
13040 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
13041 
/*
 * PPL-side implementation: ask libTrustCache whether a trust cache with the
 * given UUID is registered with the PPL runtime.  Returns KERN_SUCCESS when
 * found, KERN_NOT_FOUND when absent, KERN_FAILURE on any other error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13071 
/* Kernel-side entry point: the check must run inside the PPL. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
13078 
/*
 * PPL-side implementation of pmap_load_trust_cache_with_type().
 *
 * Validates the requested trust cache type (and, where applicable, the
 * caller's entitlement), locks down the payload and manifest pages so the
 * kernel can no longer mutate them, and hands them to libTrustCache for
 * verification and registration with the PPL runtime.
 *
 * On success the payload pages remain locked down (owned by the monitor);
 * on failure they are returned to the kernel.  The manifest pages are always
 * returned once the load attempt completes.  Returns KERN_SUCCESS,
 * KERN_ALREADY_IN_SET for duplicates, KERN_RESOURCE_SHORTAGE if no spare PPL
 * page was available, or KERN_FAILURE on verification errors.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13198 
/*
 * Kernel-side entry point for loading a trust cache into the PPL runtime.
 * If the PPL reports KERN_RESOURCE_SHORTAGE, donate a page to its free list
 * and retry until the call no longer fails for lack of memory.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
13228 
/*
 * Query the PPL trust cache runtime for a CDHash.  "Safe" means the caller
 * has already copied the arguments into PPL-accessible storage (see
 * pmap_query_trust_cache_internal()).  Returns KERN_SUCCESS when found,
 * KERN_NOT_FOUND when absent, KERN_INVALID_ARGUMENT for a bad query type,
 * KERN_FAILURE on any other error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13267 
/*
 * PPL entry point for trust cache queries: copies the caller's CDHash into
 * PPL storage (preventing time-of-check/time-of-use races on kernel memory),
 * performs the query, then copies the result token back out to the caller.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the kernel out-parameter while writing through it from the PPL. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13292 
13293 kern_return_t
13294 pmap_query_trust_cache(
13295 	TCQueryType_t query_type,
13296 	const uint8_t cdhash[kTCEntryHashSize],
13297 	TrustCacheQueryToken_t *query_token)
13298 {
13299 	kern_return_t ret = KERN_NOT_FOUND;
13300 
13301 	ret = pmap_query_trust_cache_ppl(
13302 		query_type,
13303 		cdhash,
13304 		query_token);
13305 
13306 	return ret;
13307 }
13308 
13309 MARK_AS_PMAP_DATA uint8_t ppl_developer_mode_set = 0;
13310 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13311 
/*
 * PPL-side implementation of pmap_toggle_developer_mode().  Counts how many
 * times the state has been set (ppl_developer_mode_set) and panics if an
 * enable transition arrives later than the allowed early-boot calls.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
#if PMAP_CS_INCLUDE_INTERNAL_CODE
	/*
	 * On internal builds, we may call into the PPL twice in order to enable developer
	 * mode during early boot and during data migration. The latter does not happen for
	 * non-internal builds, and thus those only need to support a single transition to
	 * enabling developer mode.
	 */
	const uint8_t epoch_enable = 2;
#else
	const uint8_t epoch_enable = 1;
#endif

	/*
	 * We don't really care if the state is false -- in that case, the transition can
	 * happen as many times as needed. However, we still need to increment whenever we
	 * set the state as such. This is partly because we need to track whether we have
	 * actually resolved the state or not, and also because we expect developer mode
	 * to only be enabled during the first or second (internal-only) call into this
	 * function.
	 */
	uint8_t epoch = os_atomic_inc_orig(&ppl_developer_mode_set, relaxed);

	if (state == os_atomic_load(&ppl_developer_mode_storage, relaxed)) {
		return;
	} else if ((state == true) && (epoch >= epoch_enable)) {
		panic("PMAP_CS: enabling developer mode incorrectly [%u]", epoch);
	}

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13347 
/* Kernel-side entry point: developer mode state is owned by the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13354 
13355 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13356 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13357 
13358 #pragma mark Image4 - New
13359 
/* Binds an image4 CS trap selector to the handler that services it. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;          /* trap selector being dispatched */
	image4_cs_trap_handler_t handler;   /* AppleImage4 handler for the selector */
} pmap_image4_dispatch_t;
13364 
/*
 * Handle IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE: copy the caller's argument
 * vector into PPL-owned storage, then dispatch it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13384 
13385 
13386 
/*
 * Handle IMAGE4_CS_TRAP_NONCE_SET: copy the caller's argument vector into
 * PPL-owned storage, then dispatch it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13407 
/*
 * Handle IMAGE4_CS_TRAP_NONCE_ROLL: copy the caller's argument vector into
 * PPL-owned storage, then dispatch it to AppleImage4.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13424 
/*
 * Handle IMAGE4_CS_TRAP_IMAGE_ACTIVATE: copy the argument vector into
 * PPL-owned storage, lock down the payload and manifest pages, and dispatch
 * to AppleImage4.  On success the payload stays locked down (owned by the
 * monitor); on failure it is returned to the kernel.  The manifest is always
 * returned once activation completes.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13480 
13481 MARK_AS_PMAP_TEXT static errno_t
13482 _pmap_image4_monitor_trap_passthrough(
13483 	__unused const pmap_image4_dispatch_t *dispatch,
13484 	__unused const void *input_data,
13485 	__unused size_t input_size)
13486 {
13487 #if DEVELOPMENT || DEBUG || KASAN
13488 	return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13489 #else
13490 	pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13491 	return ENOSYS;
13492 #endif
13493 }
13494 
/*
 * PPL-side dispatcher for image4 code-signing monitor traps: resolves and
 * validates the selector and input size, reserves a spare PPL page for
 * CoreCrypto, routes the request to the matching handler, then releases the
 * reserved page.  Returns ENOMEM when no spare page could be reserved so the
 * kernel-side wrapper can donate a page and retry.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13560 
13561 errno_t
13562 pmap_image4_monitor_trap(
13563 	image4_cs_trap_t selector,
13564 	const void *input_data,
13565 	size_t input_size)
13566 {
13567 	errno_t err = EPERM;
13568 
13569 	err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13570 	while (err == ENOMEM) {
13571 		/* Allocate a page from the free list */
13572 		pmap_alloc_page_for_ppl(0);
13573 
13574 		/* Call the monitor dispatch again */
13575 		err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13576 	}
13577 
13578 	return err;
13579 }
13580 
13581 #endif /* PMAP_CS_PPL_MONITOR */
13582 
13583 #if PMAP_CS_INCLUDE_CODE_SIGNING
13584 
/*
 * Comparison routine for the registered-profiles red-black tree.
 *
 * Profiles carry no natural key, so nodes are ordered purely by their
 * memory address. Returns -1, 0 or 1 for less-than, equal and greater-than
 * respectively, as required by the RB macros.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13597 
/* Red-black tree for managing provisioning profiles */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Tree nodes are keyed by object address -- see pmap_cs_profiles_rbtree_compare() */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13607 
13608 void
13609 pmap_initialize_provisioning_profiles(void)
13610 {
13611 	/* Initialize the profiles red-black tree lock */
13612 	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13613 	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13614 
13615 	/* Initialize the red-black tree itself */
13616 	RB_INIT(&pmap_cs_registered_profiles);
13617 
13618 	printf("initialized PPL provisioning profile data\n");
13619 }
13620 
/*
 * Determine whether a profile is a TestFlight (beta) profile by querying its
 * entitlements for a "beta-reports-active" entitlement with a true value.
 *
 * A profile which provisions no entitlements at all is never considered a
 * TestFlight profile.
 */
static bool
pmap_is_testflight_profile(
	pmap_cs_profile_t *profile_obj)
{
	const char *entitlement_name = "beta-reports-active";
	const size_t entitlement_length = strlen(entitlement_name);
	CEQueryOperation_t query[2] = {0};

	/* If the profile provisions no entitlements, then it isn't a test flight one */
	if (profile_obj->entitlements_ctx == NULL) {
		return false;
	}

	/* Build our CoreEntitlements query: select the key, then match a true value */
	query[0].opcode = kCEOpSelectKey;
	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
	query[0].parameters.stringParameter.length = entitlement_length;
	query[1] = CEMatchBool(true);

	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
		profile_obj->entitlements_ctx,
		query, 2);

	/* The query only succeeds when the entitlement is present and true */
	if (ce_err == amfi->CoreEntitlements.kNoError) {
		return true;
	}

	return false;
}
13650 
13651 static bool
13652 pmap_is_development_profile(
13653 	pmap_cs_profile_t *profile_obj)
13654 {
13655 	/* Check for UPP */
13656 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13657 		*profile_obj->profile_ctx,
13658 		CESelectDictValue("ProvisionsAllDevices"));
13659 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13660 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13661 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13662 			return false;
13663 		}
13664 	}
13665 
13666 	/* Check for TestFlight profile */
13667 	if (pmap_is_testflight_profile(profile_obj) == true) {
13668 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13669 		return false;
13670 	}
13671 
13672 	pmap_cs_log_info("%p: development profile", profile_obj);
13673 	return true;
13674 }
13675 
/*
 * Extract and validate the "Entitlements" dictionary from a profile's DER
 * context and set up a CoreEntitlements query context for it inside the
 * profile object.
 *
 * Returns:
 *  - KERN_NOT_FOUND when the profile provisions no entitlements (the
 *    profile's entitlements context is cleared in that case),
 *  - KERN_ABORTED when CoreEntitlements validation or context acquisition
 *    fails,
 *  - KERN_SUCCESS otherwise.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements -- leave the profile with a NULL entitlements context */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Raw DER bounds of the entitlements dictionary */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13726 
/*
 * Register a provisioning profile with the PPL.
 *
 * The payload contains both the raw profile blob and storage for the PPL's
 * profile management structure (pmap_profile_payload_t). The payload pages
 * are locked down (kept PPL-writable so the embedded management structure can
 * be updated), the blob is validated through CoreTrust, a CoreEntitlements
 * context is built on top of the validated content, and the resulting profile
 * object is inserted into the registered-profiles red-black tree.
 *
 * Returns KERN_RESOURCE_SHORTAGE when no spare PPL page could be reserved
 * (the kernel-side wrapper donates a page and retries), KERN_SUCCESS on
 * success; all validation failures panic.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the blob through CoreTrust, which yields the profile content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements DER context over the validated content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13830 
13831 kern_return_t
13832 pmap_register_provisioning_profile(
13833 	const vm_address_t payload_addr,
13834 	const vm_size_t payload_size)
13835 {
13836 	kern_return_t ret = KERN_DENIED;
13837 
13838 	ret = pmap_register_provisioning_profile_ppl(
13839 		payload_addr,
13840 		payload_size);
13841 
13842 	while (ret == KERN_RESOURCE_SHORTAGE) {
13843 		/* Allocate a page from the free list */
13844 		pmap_alloc_page_for_ppl(0);
13845 
13846 		/* Attempt the call again */
13847 		ret = pmap_register_provisioning_profile_ppl(
13848 			payload_addr,
13849 			payload_size);
13850 	}
13851 
13852 	return ret;
13853 }
13854 
/*
 * Remove a previously registered profile from the PPL.
 *
 * Fails with KERN_FAILURE when the profile still has outstanding references
 * (code signatures associated with it). On success the payload pages which
 * were locked down at registration time are unlocked and returned to the
 * kernel. Panics if the profile was never registered.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A profile with live associations cannot be unregistered */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile memory now belongs to the kernel again -- drop our pointer */
		profile_obj = NULL;
	}
	return ret;
}
13902 
/*
 * Kernel-side entry point for unregistering a provisioning profile; forwards
 * directly into the PPL (no retry semantics are required for this call).
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13909 
/*
 * Associate a registered provisioning profile with a code signature.
 *
 * Association is only permitted while the signature is still untrusted and
 * has no existing profile association. A reference is taken on the profile
 * so it cannot be unregistered while associated. Panics when the profile is
 * unknown to the tree or was never marked validated.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* Drop the exclusive code-directory lock taken above */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13958 
/*
 * Kernel-side entry point for associating a profile with a code signature;
 * forwards directly into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13966 
/*
 * Break the association between a code signature and its provisioning
 * profile, dropping the reference which was taken at association time.
 *
 * Returns KERN_NOT_FOUND when the signature has no associated profile,
 * KERN_SUCCESS otherwise.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
14002 
/*
 * Kernel-side entry point for disassociating a profile from a code
 * signature; forwards directly into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
14009 
14010 kern_return_t
14011 pmap_associate_kernel_entitlements_internal(
14012 	pmap_cs_code_directory_t *cd_entry,
14013 	const void *kernel_entitlements)
14014 {
14015 	kern_return_t ret = KERN_DENIED;
14016 
14017 	if (kernel_entitlements == NULL) {
14018 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
14019 	}
14020 
14021 	/* Acquire the lock on the code directory */
14022 	pmap_cs_lock_code_directory(cd_entry);
14023 
14024 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
14025 		ret = KERN_DENIED;
14026 		goto out;
14027 	} else if (cd_entry->kernel_entitlements != NULL) {
14028 		ret = KERN_DENIED;
14029 		goto out;
14030 	}
14031 	cd_entry->kernel_entitlements = kernel_entitlements;
14032 
14033 	/* Association was a success */
14034 	ret = KERN_SUCCESS;
14035 
14036 out:
14037 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
14038 	return ret;
14039 }
14040 
/*
 * Kernel-side entry point for associating kernel entitlements with a code
 * signature; forwards directly into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
14048 
/*
 * Resolve the kernel entitlements object associated with the code signature
 * of a pmap's main region.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap, for pmaps whose main region
 * has no code signature, or when no kernel entitlements were associated.
 * Returns KERN_ABORTED when the pmap lock could not be acquired via
 * pmap_lock_preempt(); the kernel-side wrapper retries in that case.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14111 
14112 kern_return_t
14113 pmap_resolve_kernel_entitlements(
14114 	pmap_t pmap,
14115 	const void **kernel_entitlements)
14116 {
14117 	kern_return_t ret = KERN_DENIED;
14118 
14119 	do {
14120 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14121 	} while (ret == KERN_ABORTED);
14122 
14123 	return ret;
14124 }
14125 
/*
 * Build a CoreEntitlements acceleration index for a signature's entitlements
 * context.
 *
 * Only reconstituted (trusted) code signatures may be accelerated, and only
 * indexes up to PAGE_SIZE are supported. The index buffer is placed, in order
 * of preference: within unused space of the locked-down code signature, in a
 * PPL blob-allocator bucket, or in a freshly allocated PPL page. Allocation
 * failures propagate to the kernel-side wrapper (which donates a page and
 * retries on KERN_RESOURCE_SHORTAGE). KERN_ABORTED means the index is too
 * large to accelerate; KERN_SUCCESS covers both "accelerated" and "nothing
 * to accelerate".
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* Nothing to do without an entitlements context, or if already accelerated */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Place the buffer in the unused, pointer-aligned tail of the signature */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			/* Fall back to a whole page from the PPL free list */
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		/* Mark the buffer as separately allocated (not inside the signature) */
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14272 
14273 kern_return_t
14274 pmap_accelerate_entitlements(
14275 	pmap_cs_code_directory_t *cd_entry)
14276 {
14277 	kern_return_t ret = KERN_DENIED;
14278 
14279 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
14280 	while (ret == KERN_RESOURCE_SHORTAGE) {
14281 		/* Allocate a page for the PPL */
14282 		pmap_alloc_page_for_ppl(0);
14283 
14284 		/* Try again */
14285 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
14286 	}
14287 
14288 	return ret;
14289 }
14290 
14291 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14292 
14293 MARK_AS_PMAP_TEXT bool
14294 pmap_lookup_in_loaded_trust_caches_internal(
14295 	const uint8_t cdhash[CS_CDHASH_LEN])
14296 {
14297 	kern_return_t kr = KERN_NOT_FOUND;
14298 
14299 #if PMAP_CS_PPL_MONITOR
14300 	/*
14301 	 * If we have the PPL monitor, then this function can only be called from
14302 	 * within the PPL. Calling it directly would've caused a panic, so we can
14303 	 * assume that we're in the PPL here.
14304 	 */
14305 	uint8_t cdhash_safe[CS_CDHASH_LEN];
14306 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14307 
14308 	kr = pmap_query_trust_cache_safe(
14309 		kTCQueryTypeLoadable,
14310 		cdhash_safe,
14311 		NULL);
14312 #else
14313 	kr = query_trust_cache(
14314 		kTCQueryTypeLoadable,
14315 		cdhash,
14316 		NULL);
14317 #endif
14318 
14319 	if (kr == KERN_SUCCESS) {
14320 		return true;
14321 	}
14322 	return false;
14323 }
14324 
/*
 * Query the loadable trust caches for a CDHash, routing through the PPL when
 * the monitor is present.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14335 
/*
 * Query the static trust cache for a CDHash.
 *
 * On a hit, returns a packed result word: TC_LOOKUP_FOUND in the result
 * field, plus the entry's hash type and (truncated to 8 bits) flags in their
 * respective fields. Returns 0 when the hash is not present.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the entry's flags and hash type from the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14376 
/*
 * Query the static trust cache for a CDHash, routing through the PPL when
 * the monitor is present.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14386 
14387 #if PMAP_CS_INCLUDE_CODE_SIGNING
14388 
/* Serializes access to the compilation service CDHash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash of the compilation service; all-zero until explicitly set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14391 
14392 MARK_AS_PMAP_TEXT void
14393 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14394 {
14395 
14396 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14397 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14398 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14399 
14400 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14401 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14402 }
14403 
/*
 * Compare a CDHash against the stored compilation service CDHash.
 *
 * Always returns false in lockdown mode, where the compilation service is
 * disallowed. The comparison runs under the lock to pair with the locked
 * update in pmap_set_compilation_service_cdhash_internal().
 */
MARK_AS_PMAP_TEXT bool
pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
{
	bool match = false;

	/* Lockdown mode disallows compilation service */
	if (ppl_lockdown_mode_enabled == true) {
		return false;
	}

	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
		match = true;
	}
	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);

	if (match) {
		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
	}

	return match;
}
14426 
/*
 * Public entry point for registering the compilation service CDHash.
 *
 * @param cdhash The CS_CDHASH_LEN byte code-directory hash to store.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	/* Route through the PPL so the protected copy of the hash is updated. */
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14436 
/*
 * Public entry point for matching a CDHash against the registered
 * compilation service hash.
 *
 * @param cdhash The CS_CDHASH_LEN byte code-directory hash to compare.
 *
 * @return true if the hash matches (and lockdown mode is not enabled).
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14446 
14447 /*
14448  * As part of supporting local signing on the device, we need the PMAP layer
14449  * to store the local signing key so that PMAP_CS can validate with it. We
14450  * store it at the PMAP layer such that it is accessible to both AMFI and
14451  * PMAP_CS should they need it.
14452  */
/* Set exactly once, when the local signing public key is installed. */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* The local signing public key; only meaningful once the flag above is true. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14455 
14456 MARK_AS_PMAP_TEXT void
14457 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14458 {
14459 	bool key_set = false;
14460 
14461 	/*
14462 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14463 	 * a successful exchange means that the local signing public key has _not_ been
14464 	 * set. In case the key has been set, we panic as we would never expect the
14465 	 * kernel to attempt to set the key more than once.
14466 	 */
14467 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14468 
14469 	if (key_set) {
14470 		panic("attempted to set the local signing public key multiple times");
14471 	}
14472 
14473 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14474 	pmap_cs_log_info("set local signing public key");
14475 }
14476 
/*
 * Public entry point for installing the local signing public key.
 *
 * @param public_key The PMAP_CS_LOCAL_SIGNING_KEY_SIZE byte public key.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14486 
14487 uint8_t*
14488 pmap_get_local_signing_public_key(void)
14489 {
14490 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14491 
14492 	if (key_set) {
14493 		return pmap_local_signing_public_key;
14494 	}
14495 
14496 	return NULL;
14497 }
14498 
14499 /*
14500  * Locally signed applications need to be explicitly authorized by an entitled application
14501  * before we allow them to run.
14502  */
/* CDHash of the single locally-signed binary currently authorized to run. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects pmap_local_signing_cdhash above. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14505 
/*
 * Authorize a locally-signed binary to run by recording its CDHash.
 * Only one CDHash is unrestricted at a time; this overwrites any
 * previously recorded hash.
 *
 * @param cdhash The CS_CDHASH_LEN byte code-directory hash to unrestrict.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	/* Log only the first five bytes of the hash for diagnostics. */
	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
14518 
/*
 * Public entry point for authorizing a locally-signed binary.
 *
 * @param cdhash The CS_CDHASH_LEN byte code-directory hash to unrestrict.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14529 
14530 #if PMAP_CS
/*
 * Revoke the current local-signing authorization by clearing the stored
 * CDHash (an all-zero hash matches nothing a caller would present).
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14538 
14539 MARK_AS_PMAP_TEXT static bool
14540 pmap_local_signing_restricted(
14541 	const uint8_t cdhash[CS_CDHASH_LEN])
14542 {
14543 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14544 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14545 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14546 
14547 	return ret != 0;
14548 }
14549 
14550 #endif
14551 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14552 
/*
 * Suspend or resume footprint accounting for the current thread.
 *
 * @param map     VM map whose pmap is tagged when suspending.
 * @param suspend TRUE to suspend, FALSE to resume.
 *
 * No-op on RELEASE kernels.  Note the asymmetry: suspending also sets the
 * sticky footprint_was_suspended marker on the map's pmap, while resuming
 * only clears the per-thread flag.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14570 
/*
 * Public entry point for suspending/resuming footprint accounting.
 *
 * @param map     VM map whose pmap is tagged when suspending.
 * @param suspend TRUE to suspend, FALSE to resume.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14582 
/*
 * Deliberate no-op that still validates the pmap argument; useful for
 * exercising the call path (e.g. PPL entry/exit) without side effects.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14588 
/*
 * Public no-op entry point; see pmap_nop_internal().
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14598 
14599 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14600 
/*
 * Header emitted before each dumped translation table in the buffer
 * produced by pmap_dump_page_tables().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the table */
	uint64_t num_entries; /* number of tt entries copied after this header */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA just past the range covered by this table */
};
14607 
/*
 * Recursively copy a pmap's translation tables into a caller-supplied
 * buffer, each table preceded by a struct page_table_dump_header.
 *
 * @param pmap         The pmap whose tables are being dumped.
 * @param ttp          Kernel-virtual pointer to the table at this level.
 * @param cur_level    Level of ttp within the page-table hierarchy.
 * @param level_mask   Bitmask selecting which levels to copy out.
 * @param start_va     VA translated by the first entry in ttp.
 * @param buf_start    Start of the output buffer.
 * @param buf_end      End of the output buffer.
 * @param bytes_copied In/out running total of bytes written so far.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE when the
 *         remaining buffer cannot hold this table plus its header.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and descriptor decoding masks. */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	/* Next free position in the output buffer. */
	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be a different (smaller) size than lower levels. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/*
	 * Space check is done up front for every table visited, even if this
	 * level is filtered out by level_mask (conservative).
	 */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit header followed by the raw table contents. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table to recurse into. */
			continue;
		} else {
			/* A leaf-level entry that isn't a block descriptor is corrupt. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14682 
/*
 * Dump a pmap's translation tables into a buffer for offline inspection.
 * Must only be called from kernel debugger context (panics otherwise),
 * since the tables are walked without synchronization.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE from the recursion.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14692 
14693 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14694 
/*
 * Stub for configurations without page-table dumping support
 * (non-arm64 or RELEASE kernels).
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14701 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14702 
14703 
14704 #ifdef CONFIG_XNUPOST
14705 #ifdef __arm64__
/* Set by pmap_test_fault_handler when an expected test fault is taken. */
static volatile bool pmap_test_took_fault = false;
14707 
14708 static bool
14709 pmap_test_fault_handler(arm_saved_state_t * state)
14710 {
14711 	bool retval                 = false;
14712 	uint64_t esr                = get_saved_state_esr(state);
14713 	esr_exception_class_t class = ESR_EC(esr);
14714 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14715 
14716 	if ((class == ESR_EC_DABORT_EL1) &&
14717 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14718 		pmap_test_took_fault = true;
14719 		/* return to the instruction immediately after the call to NX page */
14720 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14721 		retval = true;
14722 	}
14723 
14724 	return retval;
14725 }
14726 
14727 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14728 static NOKASAN bool
14729 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14730 {
14731 	pmap_t old_pmap = NULL;
14732 	thread_t thread = current_thread();
14733 
14734 	pmap_test_took_fault = false;
14735 
14736 	/*
14737 	 * We're potentially switching pmaps without using the normal thread
14738 	 * mechanism; disable interrupts and preemption to avoid any unexpected
14739 	 * memory accesses.
14740 	 */
14741 	uint64_t old_int_state = pmap_interrupts_disable();
14742 	mp_disable_preemption();
14743 
14744 	if (pmap != NULL) {
14745 		old_pmap = current_pmap();
14746 		pmap_switch(pmap, thread);
14747 
14748 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
14749 #if __ARM_PAN_AVAILABLE__
14750 		__builtin_arm_wsr("pan", 0);
14751 #endif /* __ARM_PAN_AVAILABLE__ */
14752 	}
14753 
14754 	ml_expect_fault_begin(pmap_test_fault_handler, va);
14755 
14756 	if (is_write) {
14757 		*((volatile uint64_t*)(va)) = 0xdec0de;
14758 	} else {
14759 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
14760 		(void)tmp;
14761 	}
14762 
14763 	/* Save the fault bool, and undo the gross stuff we did. */
14764 	bool took_fault = pmap_test_took_fault;
14765 	ml_expect_fault_end();
14766 
14767 	if (pmap != NULL) {
14768 #if __ARM_PAN_AVAILABLE__
14769 		__builtin_arm_wsr("pan", 1);
14770 #endif /* __ARM_PAN_AVAILABLE__ */
14771 
14772 		pmap_switch(old_pmap, thread);
14773 	}
14774 
14775 	mp_enable_preemption();
14776 	pmap_interrupts_restore(old_int_state);
14777 	bool retval = (took_fault == should_fault);
14778 	return retval;
14779 }
14780 
14781 static bool
14782 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14783 {
14784 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14785 
14786 	if (!retval) {
14787 		T_FAIL("%s: %s, "
14788 		    "pmap=%p, va=%p, should_fault=%u",
14789 		    __func__, should_fault ? "did not fault" : "faulted",
14790 		    pmap, (void*)va, (unsigned)should_fault);
14791 	}
14792 
14793 	return retval;
14794 }
14795 
14796 static bool
14797 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14798 {
14799 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14800 
14801 	if (!retval) {
14802 		T_FAIL("%s: %s, "
14803 		    "pmap=%p, va=%p, should_fault=%u",
14804 		    __func__, should_fault ? "did not fault" : "faulted",
14805 		    pmap, (void*)va, (unsigned)should_fault);
14806 	}
14807 
14808 	return retval;
14809 }
14810 
14811 static bool
14812 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14813 {
14814 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14815 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14816 
14817 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14818 
14819 	if (!retval) {
14820 		T_FAIL("%s: bits=%u, "
14821 		    "pa=%p, should_be_set=%u",
14822 		    __func__, bits,
14823 		    (void*)pa, should_be_set);
14824 	}
14825 
14826 	return retval;
14827 }
14828 
14829 static __attribute__((noinline)) bool
14830 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14831 {
14832 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14833 	return retval;
14834 }
14835 
/*
 * Run the pmap unit test suite against a pmap created with the given
 * creation flags.  Exercises mapping creation, PTE introspection,
 * protection changes, the ref/mod state machine, arm_fast_fault,
 * pmap_page_protect, and pmap_disconnect.
 *
 * @param flags PMAP_CREATE_* flags passed to pmap_create_options().
 *
 * @return 0 on success (individual failures are reported via T_FAIL).
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* page_ratio = how many test-pmap pages fit in one native page. */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		/* NOTE(review): format string lacks a separator before "flags=". */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT.  This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Test XO mapping");
	kern_return_t kr = pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
	if (pmap_allows_xo(pmap)) {
		if (kr != KERN_SUCCESS) {
			T_FAIL("XO mapping returned 0x%x instead of KERN_SUCCESS", (unsigned int)kr);
		}
	} else if (kr != KERN_PROTECTION_FAILURE) {
		T_FAIL("XO mapping returned 0x%x instead of KERN_PROTECTION_FAILURE", (unsigned int)kr);
	}

	T_LOG("Make the first mapping RX");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE | VM_PROT_READ, VM_PROT_EXECUTE, 0, false);

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault.  Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages.  Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15109 #endif /* __arm64__ */
15110 
/*
 * XNUPOST entry point for the pmap tests.  On arm64, runs the full test
 * configuration for each supported page size; elsewhere it is a no-op.
 *
 * @return KERN_SUCCESS (individual failures are reported via T_FAIL).
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	/* Mixed-page-size kernels exercise both 4K and native 16K geometries. */
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
15132 #endif /* CONFIG_XNUPOST */
15133 
15134 /*
15135  * The following function should never make it to RELEASE code, since
15136  * it provides a way to get the PPL to modify text pages.
15137  */
15138 #if DEVELOPMENT || DEBUG
15139 
/* Permanently-undefined instruction encodings used to corrupt text pages. */
#define ARM_UNDEFINED_INSN 0xe7f000f0
#define ARM_UNDEFINED_INSN_THUMB 0xde00
15142 
15143 /**
15144  * Forcibly overwrite executable text with an illegal instruction.
15145  *
15146  * @note Only used for xnu unit testing.
15147  *
15148  * @param pa The physical address to corrupt.
15149  *
15150  * @return KERN_SUCCESS on success.
15151  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* Text pages are PPL-protected; the write must happen inside the PPL. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
15161 
/*
 * Overwrite the instruction at physical address pa with an undefined
 * encoding, temporarily making the page writable if it is marked
 * executable, then invalidate the icache for the modified region.
 *
 * @param pa Physical address of the instruction to corrupt; the low bit
 *           selects a THUMB (16-bit) rather than ARM (32-bit) encoding.
 *
 * @return KERN_SUCCESS.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the pv-head lock so the page's attributes cannot change under us. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable pages are mapped read-only; open a write window first. */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the read-only protection on the executable page. */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Ensure the CPU refetches the modified instruction, not a stale copy. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15204 
15205 #endif /* DEVELOPMENT || DEBUG */
15206