xref: /xnu-12377.1.9/osfmk/arm/pmap/pmap.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2011-2021, 2023-2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51 
52 #include <os/overflow.h>
53 
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map_xnu.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object_internal.h>
60 #include <vm/vm_page_internal.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm_internal.h>
63 
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70 
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74 #include <machine/machine_routines.h>
75 
76 #include <arm/caches_internal.h>
77 #include <arm/cpu_data.h>
78 #include <arm/cpu_data_internal.h>
79 #include <arm/cpu_capabilities.h>
80 #include <arm/cpu_number.h>
81 #include <arm/machine_cpu.h>
82 #include <arm/misc_protos.h>
83 #include <arm/pmap/pmap_internal.h>
84 #include <arm/trap_internal.h>
85 
86 #include <arm64/proc_reg.h>
87 #include <pexpert/arm64/boot.h>
88 #include <arm64/ppl/sart.h>
89 #include <arm64/ppl/uat.h>
90 
91 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
92 #include <arm64/amcc_rorgn.h>
93 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94 
95 #include <pexpert/device_tree.h>
96 
97 #include <san/kasan.h>
98 #include <sys/cdefs.h>
99 
100 #if defined(HAS_APPLE_PAC)
101 #include <ptrauth.h>
102 #endif
103 
104 #ifdef CONFIG_XNUPOST
105 #include <tests/xnupost.h>
106 #endif
107 
108 
109 
110 #if HIBERNATION
111 #include <IOKit/IOHibernatePrivate.h>
112 #endif /* HIBERNATION */
113 
114 #define PMAP_L1_MAX_ENTRY (ARM_PTE_T1_REGION_MASK(TCR_EL1_BOOT) >> ARM_TT_L1_SHIFT)
115 #define PMAP_ROOT_ALLOC_SIZE ((PMAP_L1_MAX_ENTRY + 1) * sizeof(tt_entry_t))
116 
117 #ifndef __ARM64_PMAP_SUBPAGE_L1__
118 _Static_assert(ARM_PGBYTES == PMAP_ROOT_ALLOC_SIZE, "Unexpected L1 Size");
119 #endif
120 
121 #if __ARM_VMSA__ != 8
122 #error Unknown __ARM_VMSA__
123 #endif
124 
125 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
126 
127 extern u_int32_t random(void); /* from <libkern/libkern.h> */
128 
129 static bool alloc_asid(pmap_t pmap);
130 static void free_asid(pmap_t pmap);
131 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
132 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
133 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
134 
/*
 * Native (stage 1) page table operations: the ASID allocation and TLB
 * maintenance callbacks installed in the per-geometry page_table_attr
 * structures below.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
143 
/*
 * Per-level translation table geometry for 16KB translation granules, one
 * entry per lookup level (L0-L3).  Levels 0-2 use the table/block TTE
 * encodings; L3 (the leaf level) uses the PTE valid/type encodings.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = { /* Leaf level: page entries use PTE valid/type encodings. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
183 
/*
 * Per-level translation table geometry for 4KB translation granules, one
 * entry per lookup level (L0-L3).  Levels 0-2 use the table/block TTE
 * encodings; L3 (the leaf level) uses the PTE valid/type encodings.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = { /* Leaf level: page entries use PTE valid/type encodings. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
223 
/*
 * Per-level translation table geometry for stage 2 translation with 4KB
 * granules.  L0 is unused; translation starts at an L1 table which may be
 * concatenated (hence the larger L1 index mask when available).
 */
const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
{
	[0] = { /* Unused */
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = { /* Concatenated, so index mask is larger than normal */
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
#ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
#else
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
#endif
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = { /* Leaf level: page entries use PTE valid/type encodings. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
267 
/*
 * Page table attributes for stage 1 pmaps using 4KB translation granules:
 * level geometry, root/commpage/max lookup levels, access-permission and
 * execute-never PTE bits, and the associated page-size constants.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/* Root level derived from T0SZ: each 4K lookup level resolves 9 VA bits. */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
};
296 
/*
 * Page table attributes for stage 1 pmaps using 16KB translation granules:
 * level geometry, root/commpage/max lookup levels, access-permission and
 * execute-never PTE bits, and the associated page-size constants.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
};
317 
318 #if __ARM_16K_PG__
319 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
320 #else /* !__ARM_16K_PG__ */
321 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
322 #endif /* !__ARM_16K_PG__ */
323 
324 
325 #if DEVELOPMENT || DEBUG
326 int vm_footprint_suspend_allowed = 1;
327 
328 extern int pmap_ledgers_panic;
329 extern int pmap_ledgers_panic_leeway;
330 
331 #endif /* DEVELOPMENT || DEBUG */
332 
333 #if DEVELOPMENT || DEBUG
334 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
335 	(current_thread()->pmap_footprint_suspended)
336 #else /* DEVELOPMENT || DEBUG */
337 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
338 #endif /* DEVELOPMENT || DEBUG */
339 
340 
341 /*
342  * Represents a tlb range that will be flushed before exiting
343  * the ppl.
344  * Used by phys_attribute_clear_range to defer flushing pages in
345  * this range until the end of the operation.
346  */
347 typedef struct pmap_tlb_flush_range {
348 	pmap_t ptfr_pmap;
349 	vm_map_address_t ptfr_start;
350 	vm_map_address_t ptfr_end;
351 	bool ptfr_flush_needed;
352 } pmap_tlb_flush_range_t;
353 
354 #if XNU_MONITOR
355 /*
356  * PPL External References.
357  */
358 extern vm_offset_t   segPPLDATAB;
359 extern unsigned long segSizePPLDATA;
360 extern vm_offset_t   segPPLTEXTB;
361 extern unsigned long segSizePPLTEXT;
362 extern vm_offset_t   segPPLDATACONSTB;
363 extern unsigned long segSizePPLDATACONST;
364 
365 
366 /*
367  * PPL Global Variables
368  */
369 
370 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
371 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
372 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
373 #else
374 const boolean_t pmap_ppl_disable = FALSE;
375 #endif
376 
377 /*
378  * Indicates if the PPL has started applying APRR.
379  * This variable is accessed from various assembly trampolines, so be sure to change
380  * those if you change the size or layout of this variable.
381  */
382 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
383 
384 extern void *pmap_stacks_start;
385 extern void *pmap_stacks_end;
386 
387 #endif /* XNU_MONITOR */
388 
389 
390 
391 /* Virtual memory region for early allocation */
392 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
393 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
394 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
395 
396 extern uint8_t bootstrap_pagetables[];
397 
398 extern unsigned int not_in_kdp;
399 
400 extern vm_offset_t first_avail;
401 
402 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
403 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
404 extern vm_offset_t     static_memory_end;
405 
406 extern const vm_map_address_t physmap_base;
407 extern const vm_map_address_t physmap_end;
408 
409 extern int maxproc, hard_maxproc;
410 
411 /* The number of address bits one TTBR can cover. */
412 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
413 
414 /*
415  * The bounds on our TTBRs.  These are for sanity checking that
416  * an address is accessible by a TTBR before we attempt to map it.
417  */
418 
419 /* The level of the root of a page table. */
420 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
421 
422 /* The number of entries in the root TT of a page table. */
423 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
424 
425 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
426 const pmap_t    kernel_pmap = &kernel_pmap_store;
427 
428 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
429 
430 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
431 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
432 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
433 
434 typedef struct tt_free_entry {
435 	struct tt_free_entry    *next;
436 } tt_free_entry_t;
437 
438 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
439 
440 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
441 unsigned int    free_page_size_tt_count MARK_AS_PMAP_DATA;
442 unsigned int    free_page_size_tt_max MARK_AS_PMAP_DATA;
443 #define FREE_PAGE_SIZE_TT_MAX   4
444 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
445 unsigned int    free_tt_count MARK_AS_PMAP_DATA;
446 unsigned int    free_tt_max MARK_AS_PMAP_DATA;
447 
448 #define TT_FREE_ENTRY_NULL      ((tt_free_entry_t *) 0)
449 
450 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;        /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
451 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;        /* leaf user pagetable pages, in units of PAGE_SIZE */
452 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
453 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
454 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
455 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
456 
457 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
458 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
459 
460 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
462 
463 /* Lock group used for all pmap object locks. */
464 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
465 
466 #if DEVELOPMENT || DEBUG
467 int nx_enabled = 1;                                     /* enable no-execute protection */
468 int allow_data_exec  = 0;                               /* No apps may execute data */
469 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
470 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
471 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
472 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
473 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
474 #else /* DEVELOPMENT || DEBUG */
475 const int nx_enabled = 1;                                       /* enable no-execute protection */
476 const int allow_data_exec  = 0;                         /* No apps may execute data */
477 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
478 #endif /* DEVELOPMENT || DEBUG */
479 
480 /**
481  * This variable is set true during hibernation entry to protect pmap data structures
482  * during image copying, and reset false on hibernation exit.
483  */
484 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
485 
#if MACH_ASSERT
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger validation is compiled out on non-MACH_ASSERT builds; no-op stub. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
494 
495 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
496 
497 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
498 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
499 
500 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
501 
502 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
503 #if defined(__arm64__)
504 /* end of shared region + 512MB for various purposes */
505 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
506 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
507     "Minimum address space size outside allowable range");
508 
509 // Max offset is 15.375GB for devices with "large" memory config
510 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
511 // Max offset is 11.375GB for devices with "small" memory config
512 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
513 
514 
515 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
516     "Large device address space size outside allowable range");
517 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
518     "Small device address space size outside allowable range");
519 
520 #  ifdef XNU_TARGET_OS_OSX
521 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
522 #  else
523 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
524 #  endif
525 #endif /* __arm64__ */
526 
527 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
528 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
529 #else
530 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
531 #endif
532 
533 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
534 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
535 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
536 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
537 #if !HAS_16BIT_ASID
538 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
539 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
540 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
541 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
542 #else
543 static uint16_t last_allocated_asid = 0;
544 #endif /* !HAS_16BIT_ASID */
545 
546 #if HAS_SPECRES_DEBUGGING
547 /* A debug flag that controls SPECRES instructions behavior facilitating perf evaluation. */
548 static SECURITY_READ_ONLY_LATE(uint64_t) specres_debug = 0;
549 #endif /* HAS_SPECRES_DEBUGGING */
550 
551 
552 #if __ARM_MIXED_PAGE_SIZE__
553 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
554 #endif
555 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
556 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
557 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
558 
559 /* PTE Define Macros */
560 
/*
 * True iff PTE 'x' (located at pointer 'p', used only for the diagnostic
 * message) is an invalid PTE carrying the VM "compressed page" marker.
 * Panics if the marker is present alongside unexpected bits.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */                      \
	 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */       \
	 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */       \
	 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	        (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True iff the PTE is marked wired (pinned mapping). */
#define pte_is_wired(pte)                                                               \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/*
 * True iff the software "was writeable" bit is set — presumably recording
 * that the mapping was writeable before being write-protected (e.g. for
 * fault-driven dirty/CoW tracking); confirm against users of this bit.
 */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" bit in a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
	        if ((was_writeable)) {               \
	                (pte) |= ARM_PTE_WRITEABLE;  \
	        } else {                             \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        }                                    \
	} while(0)
582 
583 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)584 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
585 {
586 	if (wired) {
587 		*ptep |= ARM_PTE_WIRED;
588 	} else {
589 		*ptep &= ~ARM_PTE_WIRED;
590 	}
591 	/*
592 	 * Do not track wired page count for kernel pagetable pages.  Kernel mappings are
593 	 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
594 	 * never reclaimed.
595 	 */
596 	if (pmap == kernel_pmap) {
597 		return;
598 	}
599 	unsigned short *ptd_wiredcnt_ptr;
600 	ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
601 	if (wired) {
602 		os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
603 	} else {
604 		unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
605 		if (__improbable(prev_wired == 0)) {
606 			panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
607 		}
608 	}
609 }
610 
611 #if HAS_FEAT_XS
612 
613 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)614 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
615 {
616 	if (__improbable(pt_attr->stage2)) {
617 		return false;
618 	}
619 	switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
620 	case CACHE_ATTRINDX_DISABLE_XS:
621 	case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
622 		return true;
623 	default:
624 		return false;
625 	}
626 }
627 
628 #endif /* HAS_FEAT_XS */
629 
630 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) {                                               \
631 	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
632 	arm64_sync_tlb(strong);                                                                               \
633 }
634 
635 /*
636  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
637  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
638  * will observe the updated PTE.
639  */
640 #define FLUSH_PTE()                                                                     \
641 	__builtin_arm_dmb(DMB_ISH);
642 
643 /*
644  * Synchronize updates to PTEs that were previously valid and thus may be cached in
645  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
646  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
647  * program order will not issue until the DSB completes.  Prior loads may be reordered
648  * after the barrier, but their behavior should not be materially affected by the
649  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
650  * matter for loads until the access is re-driven well after the TLB update is
651  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
652  * we should be in a position to handle access faults.  For "voluntary" PTE access
653  * restriction due to unmapping or protection, the decision to restrict access should
654  * have a data dependency on prior loads in order to avoid a data race.
655  */
656 #define FLUSH_PTE_STRONG()                                                             \
657 	__builtin_arm_dsb(DSB_ISHST);
658 
659 /**
660  * Write enough page table entries to map a single VM page. On systems where the
661  * VM page size does not match the hardware page size, multiple page table
662  * entries will need to be written.
663  *
664  * @note This function does not emit a barrier to ensure these page table writes
665  *       have completed before continuing. This is commonly needed. In the case
666  *       where a DMB or DSB barrier is needed, then use the write_pte() and
667  *       write_pte_strong() functions respectively instead of this one.
668  *
669  * @param ptep Pointer to the first page table entry to update.
670  * @param pte The value to write into each page table entry. In the case that
671  *            multiple PTEs are updated to a non-empty value, then the address
672  *            in this value will automatically be incremented for each PTE
673  *            write.
674  */
675 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)676 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
677 {
678 	/**
679 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
680 	 * systems, which is why it's checked at runtime instead of compile time.
681 	 * The "unreachable" warning needs to be suppressed because it still is a
682 	 * compile time constant on some systems.
683 	 */
684 	__unreachable_ok_push
685 	if (TEST_PAGE_RATIO_4) {
686 		if (((uintptr_t)ptep) & 0x1f) {
687 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
688 			    __func__, ptep, (void*)pte);
689 		}
690 
691 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
692 			/**
693 			 * If we're writing an empty/compressed PTE value, then don't
694 			 * auto-increment the address for each PTE write.
695 			 */
696 			*ptep = pte;
697 			*(ptep + 1) = pte;
698 			*(ptep + 2) = pte;
699 			*(ptep + 3) = pte;
700 		} else {
701 			*ptep = pte;
702 			*(ptep + 1) = pte | 0x1000;
703 			*(ptep + 2) = pte | 0x2000;
704 			*(ptep + 3) = pte | 0x3000;
705 		}
706 	} else {
707 		*ptep = pte;
708 	}
709 	__unreachable_ok_pop
710 }
711 
712 /**
713  * Writes enough page table entries to map a single VM page and then ensures
714  * those writes complete by executing a Data Memory Barrier.
715  *
716  * @note The DMB issued by this function is not strong enough to protect against
717  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
718  *       instruction is going to immediately be called after this write, it's
719  *       recommended to call write_pte_strong() instead of this function.
720  *
721  * See the function header for write_pte_fast() for more details on the
722  * parameters.
723  */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DMB: make the PTE store(s) visible before subsequent memory accesses. */
	FLUSH_PTE();
}
730 
731 /**
732  * Writes enough page table entries to map a single VM page and then ensures
733  * those writes complete by executing a Data Synchronization Barrier. This
734  * barrier provides stronger guarantees than the DMB executed by write_pte().
735  *
736  * @note This function is useful if you're going to immediately flush the TLB
737  *       after making the PTE write. A DSB is required to protect against the
738  *       TLB invalidate being reordered before the PTE write.
739  *
740  * See the function header for write_pte_fast() for more details on the
741  * parameters.
742  */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	/* DSB: order the PTE store(s) ahead of an immediately following TLBI. */
	FLUSH_PTE_STRONG();
}
749 
750 /**
751  * Retrieve the pmap structure for the thread running on the current CPU.
752  */
753 pmap_t
current_pmap()754 current_pmap()
755 {
756 	const pmap_t current = vm_map_pmap(current_thread()->map);
757 
758 	assert(current != NULL);
759 
760 #if XNU_MONITOR
761 	/**
762 	 * On PPL-enabled systems, it's important that PPL policy decisions aren't
763 	 * decided by kernel-writable memory. This function is used in various parts
764 	 * of the PPL, and besides validating that the pointer returned by this
765 	 * function is indeed a pmap structure, it's also important to ensure that
766 	 * it's actually the current thread's pmap. This is because different pmaps
767 	 * will have access to different entitlements based on the code signature of
768 	 * their loaded process. So if a different user pmap is set in the current
769 	 * thread structure (in an effort to bypass code signing restrictions), even
770 	 * though the structure would validate correctly as it is a real pmap
771 	 * structure, it should fail here.
772 	 *
773 	 * This only needs to occur for user pmaps because the kernel pmap's root
774 	 * page table is always the same as TTBR1 (it's set during bootstrap and not
775 	 * changed so it'd be redundant to check), and its code signing fields are
776 	 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
777 	 * it shouldn't be possible to set those fields. Due to that, an attacker
778 	 * setting the current thread's pmap to the kernel pmap as a way to bypass
779 	 * this check won't accomplish anything as it doesn't provide any extra code
780 	 * signing entitlements.
781 	 */
782 	if ((current != kernel_pmap) &&
783 	    ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
784 		panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
785 		    "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
786 	}
787 #endif /* XNU_MONITOR */
788 
789 	return current;
790 }
791 
#if DEVELOPMENT || DEBUG

/*
 * Trace levels are controlled by a bitmask in which each
 * level can be enabled/disabled by the (1<<level) position
 * in the boot arg
 * Level 0: PPL extension functionality
 * Level 1: pmap lifecycle (create/destroy/switch)
 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
 * Level 3: internal state management (attributes/fast-fault)
 * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
 */

SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;

/*
 * Emit a kdebug trace event when the given trace level is enabled in
 * pmap_trace_mask.  Wrapped in do/while(0) so the macro expands to exactly
 * one statement: a bare "if { }" expansion would silently capture a
 * following "else" at any call site of the form
 * "if (x) PMAP_TRACE(...); else ..." (dangling-else hazard).
 */
#define PMAP_TRACE(level, ...) \
	do { \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__); \
	        } \
	} while (0)
#else /* DEVELOPMENT || DEBUG */

/* Release builds compile tracing out; keep the single-statement shape. */
#define PMAP_TRACE(level, ...) do { } while (0)

#endif /* DEVELOPMENT || DEBUG */
816 
817 
818 /*
819  * Internal function prototypes (forward declarations).
820  */
821 
822 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
823 
824 static void pmap_set_reference(ppnum_t pn);
825 
826 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
827 
828 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
829 
830 static kern_return_t pmap_expand(
831 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
832 
833 static int pmap_remove_range(
834 	pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
835 
836 static tt_entry_t *pmap_tt1_allocate(
837 	pmap_t, vm_size_t, unsigned int);
838 
839 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
840 
841 static void pmap_tt1_deallocate(
842 	pmap_t, tt_entry_t *, vm_size_t, unsigned int);
843 
844 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
845 
846 static kern_return_t pmap_tt_allocate(
847 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
848 
849 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
850 
851 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
852 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
853 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
854 
855 #define PMAP_TT_DEALLOCATE_NOBLOCK      0x1
856 
857 
/* Removes the commpage mapping from the given user pmap. */
static void pmap_unmap_commpage(
	pmap_t pmap);

/* Returns TRUE if the pmap addresses a 64-bit address space. */
static boolean_t
pmap_is_64bit(pmap_t);


/* Queues (but does not synchronize) TLB invalidations for all mappings of a physical page. */
static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);

/* Updates the cacheability (WIMG) bits in the pp_attr table entry for a page. */
static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);

static bool pmap_update_cache_attributes_locked(
	ppnum_t, unsigned, bool);

/* Re-validates mappings of a page previously downgraded by arm_force_fast_fault(). */
static boolean_t arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p);

static void pmap_trim_self(pmap_t pmap);
static void pmap_trim_subord(pmap_t subord);


/*
 * Temporary prototypes, while we wait for pmap_enter to move to taking an
 * address instead of a page number.
 */
static kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired);

kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	__unused pmap_mapping_type_t mapping_type);

#ifdef CONFIG_XNUPOST
kern_return_t pmap_test(void);
#endif /* CONFIG_XNUPOST */

/*
 * PMAP_SUPPORT_PROTOTYPES() declares the entry points for one dispatchable
 * pmap operation together with its dispatch index (the trailing *_INDEX
 * argument); on XNU_MONITOR configurations that index selects the handler
 * in ppl_handler_table below.  The macro itself is defined elsewhere.
 */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	arm_fast_fault, (pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	bool from_user), ARM_FAST_FAULT_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	boolean_t,
	arm_force_fast_fault, (ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options), ARM_FORCE_FAST_FAULT_INDEX);

/* Variant of arm_force_fast_fault that batches TLB invalidations into "flush_range". */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range);
932 
933 /**
934  * Definition of the states driving the batch cache attributes update
935  * state machine.
936  */
937 typedef struct {
938 	uint64_t page_index : 32,           /* The page index to be operated on */
939 	    state : 8,                      /* The current state of the update machine */
940 	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
941 	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
942 	:0;
943 } batch_set_cache_attr_state_t;
944 
945 /* Possible values of the "state" field. */
946 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS             1
947 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS           2
948 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS         3
949 #define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE                    4
950 
951 static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
952 
/*
 * Dispatchable pmap operations: VM-facing entry points (create/destroy,
 * enter/remove/protect, nesting, cache attributes, etc.).  Each declaration
 * pairs the operation with the dispatch index used by the PPL handler table.
 */
PMAP_SUPPORT_PROTOTYPES(
	batch_set_cache_attr_state_t,
	pmap_batch_set_cache_attributes, (
#if XNU_MONITOR
		volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
		upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
		batch_set_cache_attr_state_t state,
		unsigned int page_cnt,
		unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_change_wiring, (pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	pmap_t,
	pmap_create_options, (ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t * kr), PMAP_CREATE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_enter_options, (pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options), PMAP_ENTER_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	pmap_paddr_t,
	pmap_find_pa, (pmap_t pmap,
	addr64_t va), PMAP_FIND_PA_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	boolean_t,
	pmap_is_empty, (pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	unsigned int,
	pmap_map_cpu_windows_copy, (ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);

/* Read-only zone operations: writes to RO zone memory are mediated here. */
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_ro_zone_memcpy, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	uint64_t,
	pmap_ro_zone_atomic_op, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_ro_zone_bzero, (zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);

/* Shared-region nesting operations. */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_set_shared_region, (pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size), PMAP_SET_SHARED_REGION_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_offset_t,
	pmap_nest, (pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t * krp), PMAP_NEST_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_page_protect_options, (ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	pmap_protect_options, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	void *args), PMAP_PROTECT_OPTIONS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_query_page_info, (pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	mach_vm_size_t,
	pmap_query_resident, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	pmap_remove_options, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options), PMAP_REMOVE_OPTIONS_INDEX);


PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_cache_attributes, (ppnum_t pn,
	unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_update_compressor_page, (ppnum_t pn,
	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);

#if MACH_ASSERT || XNU_MONITOR
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_process, (pmap_t pmap,
	int pid,
	char *procname), PMAP_SET_PROCESS_INDEX);
#endif

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	vm_map_offset_t,
	pmap_unnest_options, (pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);

/* Physical-page attribute (modified/referenced/etc.) manipulation. */
PMAP_SUPPORT_PROTOTYPES(
	void,
	phys_attribute_set, (ppnum_t pn,
	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	phys_attribute_clear, (ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);

#if __ARM_RANGE_TLBI__
PMAP_SUPPORT_PROTOTYPES(
	vm_map_address_t,
	phys_attribute_clear_range, (pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
#endif /* __ARM_RANGE_TLBI__ */


PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);

#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1178 
/*
 * Definition of the states used by pmap_trim().  The trim operation is
 * resumable: the current state is passed in and the next state is returned,
 * allowing the work to be split across multiple dispatches.
 */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;

PMAP_SUPPORT_PROTOTYPES(
	pmap_trim_state_t,
	pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1203 
/* Pointer-authentication helpers for signing/authenticating user pointers. */
#if HAS_APPLE_PAC
PMAP_SUPPORT_PROTOTYPES(
	void *,
	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
PMAP_SUPPORT_PROTOTYPES(
	void *,
	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
#endif /* HAS_APPLE_PAC */




/* Trust-cache and code-signing operations dispatched through the pmap layer. */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
	PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_load_trust_cache_with_type, (TCType_t type,
	const vm_address_t pmap_img4_payload,
	const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest,
	const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest,
	const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_query_trust_cache, (TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	errno_t,
	pmap_image4_monitor_trap, (image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);

#if PMAP_CS_INCLUDE_CODE_SIGNING

/* Provisioning-profile and entitlement management (PMAP CS only). */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_register_provisioning_profile, (const vm_address_t payload_addr,
	const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
	PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
	pmap_cs_profile_t * profile_obj),
	PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
	PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
	const void *kernel_entitlements),
	PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_resolve_kernel_entitlements, (pmap_t pmap,
	const void **kernel_entitlements),
	PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
	PMAP_ACCELERATE_ENTITLEMENTS_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_cs_allow_invalid, (pmap_t pmap),
	PMAP_CS_ALLOW_INVALID_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
	PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	bool,
	pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
	PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
	PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
	PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);

#endif

PMAP_SUPPORT_PROTOTYPES(
	uint32_t,
	pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	bool,
	pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);

void pmap_footprint_suspend(vm_map_t    map,
    boolean_t   suspend);
PMAP_SUPPORT_PROTOTYPES(
	void,
	pmap_footprint_suspend, (vm_map_t map,
	boolean_t suspend),
	PMAP_FOOTPRINT_SUSPEND_INDEX);




#if DEVELOPMENT || DEBUG
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	pmap_test_text_corruption, (pmap_paddr_t),
	PMAP_TEST_TEXT_CORRUPTION_INDEX);
#endif /* DEVELOPMENT || DEBUG */
1343 
1344 /*
1345  * The low global vector page is mapped at a fixed alias.
1346  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1347  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1348  * to check both addresses anyway for backward compatibility. So for now
1349  * we leave H6 and H7 where they were.
1350  */
/* 16KB pages: place the alias one 16K page above the low-globals base. */
#if (ARM_PGSHIFT == 14)
#define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
#else
/* 4KB pages (older SoCs): alias at the historical 8K offset. */
#define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
#endif


/* Page-table allocation statistics; 8-byte aligned for atomic-width updates. */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1361 
#if XNU_MONITOR

/*
 * When pointer authentication is available, each entry in the dispatch table
 * is signed as a function pointer so a corrupted entry cannot be called.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the *_INDEX constants passed to PMAP_SUPPORT_PROTOTYPES()
 * above; each entry points at the corresponding *_internal implementation.
 * The table itself is const so it cannot be retargeted after lockdown.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_SET_SHARED_REGION_INDEX] = pmap_set_shared_region_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1456 
1457 #if XNU_MONITOR
1458 /**
1459  * A convenience function for setting protections on a single physical
1460  * aperture or static region mapping without invalidating the TLB.
1461  *
1462  * @note This function does not perform any TLB invalidations. That must be done
1463  *       separately to be able to safely use the updated mapping.
1464  *
1465  * @note This function understands the difference between the VM page size and
1466  *       the kernel page size and will update multiple PTEs if the sizes differ.
1467  *       In other words, enough PTEs will always get updated to change the
1468  *       permissions on a PAGE_SIZE amount of memory.
1469  *
1470  * @note The PVH lock for the physical page represented by this mapping must
1471  *       already be locked.
1472  *
1473  * @note This function assumes the caller has already verified that the PTE
1474  *       pointer does indeed point to a physical aperture or static region page
1475  *       table. Please validate your inputs before passing it along to this
1476  *       function.
1477  *
1478  * @param ptep Pointer to the physical aperture or static region page table to
1479  *             update with a new XPRR index.
1480  * @param expected_perm The XPRR index that is expected to already exist at the
1481  *                      current mapping. If the current index doesn't match this
1482  *                      then the system will panic.
1483  * @param new_perm The new XPRR index to update the mapping with.
1484  */
1485 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1486 pmap_set_pte_xprr_perm(
1487 	pt_entry_t * const ptep,
1488 	unsigned int expected_perm,
1489 	unsigned int new_perm)
1490 {
1491 	assert(ptep != NULL);
1492 
1493 	pt_entry_t spte = *ptep;
1494 	pvh_assert_locked(pa_index(pte_to_pa(spte)));
1495 
1496 	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1497 		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1498 		    __func__, ptep, new_perm, expected_perm);
1499 	}
1500 
1501 	/**
1502 	 * The PTE involved should be valid, should not have the hint bit set, and
1503 	 * should have the expected XPRR index.
1504 	 */
1505 	if (__improbable(!pte_is_valid(spte))) {
1506 		panic_plain("%s: physical aperture or static region PTE is invalid, "
1507 		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1508 		    __func__, ptep, spte, new_perm, expected_perm);
1509 	}
1510 
1511 	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1512 		panic_plain("%s: physical aperture or static region PTE has hint bit "
1513 		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1514 		    __func__, ptep, spte, new_perm, expected_perm);
1515 	}
1516 
1517 	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1518 		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1519 		    "ptep=%p, new_perm=%u, expected_perm=%u",
1520 		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1521 	}
1522 
1523 	pt_entry_t template = spte;
1524 	template &= ~ARM_PTE_XPRR_MASK;
1525 	template |= xprr_perm_to_pte(new_perm);
1526 
1527 	write_pte_strong(ptep, template);
1528 }
1529 
1530 /**
1531  * Update the protections on a single physical aperture mapping and invalidate
1532  * the TLB so the mapping can be used.
1533  *
1534  * @note The PVH lock for the physical page must already be locked.
1535  *
1536  * @param pai The physical address index of the page whose physical aperture
1537  *            mapping will be updated with new permissions.
1538  * @param expected_perm The XPRR index that is expected to already exist at the
1539  *                      current mapping. If the current index doesn't match this
1540  *                      then the system will panic.
1541  * @param new_perm The new XPRR index to update the mapping with.
1542  */
1543 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1544 pmap_set_xprr_perm(
1545 	unsigned int pai,
1546 	unsigned int expected_perm,
1547 	unsigned int new_perm)
1548 {
1549 	pvh_assert_locked(pai);
1550 
1551 	const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1552 	pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1553 
1554 	pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1555 
1556 	native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1557 	sync_tlb_flush();
1558 }
1559 
1560 /**
1561  * Update the protections on a range of physical aperture or static region
1562  * mappings and invalidate the TLB so the mappings can be used.
1563  *
1564  * @note Static region mappings can only be updated before machine_lockdown().
1565  *       Physical aperture mappings can be updated at any time.
1566  *
1567  * @param start The starting virtual address of the static region or physical
1568  *              aperture range whose permissions will be updated.
1569  * @param end The final (inclusive) virtual address of the static region or
1570  *            physical aperture range whose permissions will be updated.
1571  * @param expected_perm The XPRR index that is expected to already exist at the
1572  *                      current mappings. If the current indices don't match
1573  *                      this then the system will panic.
1574  * @param new_perm The new XPRR index to update the mappings with.
1575  */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie wholly in either the physical aperture or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * (Adding the twig size and masking off the twig offset rounds "va"
		 * up to the next twig boundary; the final iteration is clamped to
		 * "end" below.)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		/* These ranges are expected to be fully mapped; a hole is fatal. */
		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		if (!tte_is_valid_table(tte)) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* Take the PVH lock per page, as pmap_set_pte_xprr_perm() requires. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One TLB invalidation for the whole range, after all PTEs are rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1671 
1672 #endif /* XNU_MONITOR */
1673 
1674 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1675 PMAP_ZINFO_PALLOC(
1676 	pmap_t pmap, int bytes)
1677 {
1678 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1679 }
1680 
1681 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1682 PMAP_ZINFO_PFREE(
1683 	pmap_t pmap,
1684 	int bytes)
1685 {
1686 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1687 }
1688 
1689 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1690 pmap_tt_ledger_credit(
1691 	pmap_t          pmap,
1692 	vm_size_t       size)
1693 {
1694 	if (pmap != kernel_pmap) {
1695 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1696 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1697 	}
1698 }
1699 
1700 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1701 pmap_tt_ledger_debit(
1702 	pmap_t          pmap,
1703 	vm_size_t       size)
1704 {
1705 	if (pmap != kernel_pmap) {
1706 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1707 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1708 	}
1709 }
1710 
/**
 * Record that the given hardware ASID was just handed out by clearing its
 * bit in the Pseudo-LRU availability bitmap.
 *
 * @param asid_index Hardware ASID index to mark as in-use.  Unused on
 *                   targets with 16-bit ASIDs, which do not use the pLRU
 *                   allocator.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit word of the pLRU bitmap tracks 64 ASIDs. */
		unsigned plru_index = asid_index >> 6;
		/*
		 * Clear this ASID's availability bit.  If the word becomes empty,
		 * advance that word's generation counter and refill it so its ASIDs
		 * become allocation candidates again.  The final word is refilled
		 * with its top bit clear, since that position corresponds to the
		 * unsupported index MAX_HW_ASIDS (see pmap_bootstrap()).
		 */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1724 
/**
 * Allocate a virtual ASID for the given pmap and derive the hardware ASID
 * and software ASID ("epoch") stored in it.
 *
 * On non-16-bit-ASID targets a Pseudo-LRU scheme is consulted first: the
 * least-recently-refilled 64-ASID chunk is searched for a free virtual ASID
 * whose hardware ASID has not been recently used.
 *
 * @param pmap The pmap to receive the new ASID; its hw_asid and sw_asid
 *             fields are written on success.
 *
 * @return true on success, false if all virtual ASIDs are exhausted.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the pLRU word with the oldest (lowest) generation count. */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Search the free-vasid bitmap in strides of one hardware-ASID chunk
		 * ((MAX_HW_ASIDS + 1) / 64 words), so every word examined maps onto
		 * the same 64 hardware ASIDs described by lowest_gen_bitmap.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every virtual ASID is in use; the caller must handle the failure. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID and a software "epoch". */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1;  // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1;  // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1809 
/**
 * Return the ASID held by the given pmap to the allocator, undoing the
 * transformations applied by alloc_asid().
 *
 * @param pmap The pmap whose ASID is to be released; its hw_asid field is
 *             atomically cleared.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically claim the ASID so a concurrent caller cannot double-free it. */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		/* 0 indicates no ASID was assigned (or it was already freed). */
		return;
	}

	/* Reverse the EL0/EL1 doubling and the +1 kernel-reservation offset from alloc_asid(). */
#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1;
#endif
	hw_asid -= 1;

#if HAS_16BIT_ASID
	vasid = hw_asid;
#else
	/*
	 * Reconstruct the virtual ASID.  sw_asid == UINT8_MAX marks the reserved
	 * case in which alloc_asid() landed on the unsupported hardware index
	 * MAX_HW_ASIDS and remapped the allocation to the high end of the range.
	 */
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID available again in the pLRU bitmap. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1842 
1843 
1844 boolean_t
pmap_valid_address(pmap_paddr_t addr)1845 pmap_valid_address(
1846 	pmap_paddr_t addr)
1847 {
1848 	return pa_valid(addr);
1849 }
1850 
1851 
1852 
1853 
1854 
1855 
1856 /*
1857  *      Map memory at initialization.  The physical addresses being
1858  *      mapped are not managed and are never unmapped.
1859  *
1860  *      For now, VM is already on, we only need to map the
1861  *      specified memory.
1862  */
1863 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1864 pmap_map(
1865 	vm_map_address_t virt,
1866 	vm_offset_t start,
1867 	vm_offset_t end,
1868 	vm_prot_t prot,
1869 	unsigned int flags)
1870 {
1871 	kern_return_t   kr;
1872 	vm_size_t       ps;
1873 
1874 	ps = PAGE_SIZE;
1875 	while (start < end) {
1876 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1877 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1878 
1879 		if (kr != KERN_SUCCESS) {
1880 			panic("%s: failed pmap_enter, "
1881 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1882 			    __FUNCTION__,
1883 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1884 		}
1885 
1886 		virt += ps;
1887 		start += ps;
1888 	}
1889 	return virt;
1890 }
1891 
1892 #if XNU_MONITOR
1893 /**
1894  * Remove kernel writeablity from an IO PTE value if the page is owned by
1895  * guarded mode software.
1896  *
1897  * @param paddr The physical address of the page which has to be non-DRAM.
1898  * @param tmplate The PTE value to be evaluated.
1899  *
1900  * @return A new PTE value with permission bits modified.
1901  */
static inline
pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
{
	/* Only non-managed (non-DRAM, I/O) pages may be passed in. */
	assert(!pa_valid(paddr));

	const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));

	if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
		/* PPL to own the page by converting KERN_RW to PPL_RW. */
		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
		switch (xprr_perm) {
		case XPRR_KERN_RO_PERM:
			/* Read-only mappings are already safe; leave them untouched. */
			break;
		case XPRR_KERN_RW_PERM:
			/* Strip the kernel-RW index and substitute PPL-RW. */
			tmplate &= ~ARM_PTE_XPRR_MASK;
			tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
			break;
		default:
			/* Any other permission on a PPL-owned I/O page is unexpected. */
			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
		}
	}

	return tmplate;
}
1927 #endif /* XNU_MONITOR */
1928 
/**
 * Back-door routine: map the physical range [start, end) at "virt" in the
 * kernel pmap by writing PTEs directly (bypassing pmap_enter), with memory
 * attributes selected by "options".  Intended for early boot / device
 * mappings; the mappings are never tracked in the PV lists.
 *
 * @param virt    Starting kernel virtual address for the mapping.
 * @param start   Page-aligned starting physical address.
 * @param end     Page-aligned ending physical address (exclusive).
 * @param prot    VM_PROT_WRITE selects kernel-RW; otherwise kernel-RO.
 * @param options PMAP_MAP_BD_* flag selecting the cacheability attributes.
 *
 * @return The virtual address immediately following the new mappings.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t      mem_attr;

	if (__improbable(start & PAGE_MASK)) {
		panic("%s: start 0x%lx is not page aligned", __func__, start);
	}

	if (__improbable(end & PAGE_MASK)) {
		panic("%s: end 0x%lx is not page aligned", __func__, end);
	}

	if (__improbable(!gDramBase || !gDramSize)) {
		panic("%s: gDramBase/gDramSize not initialized", __func__);
	}

	/*
	 * DRAM and non-DRAM pages get different attributes below, so reject any
	 * range that straddles the DRAM boundary.
	 */
	const bool first_page_is_dram = is_dram_addr(start);
	for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
		if (first_page_is_dram != is_dram_addr(pa)) {
			panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
			    __func__, pa, first_page_is_dram ? "is not" : "is");
		}
	}

	/* Translate the PMAP_MAP_BD_* option into PTE memory-attribute bits. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		if (is_dram_addr(start)) {
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
			/* Write-combining is not applicable to non-DRAM; fall back to posted-combined-reordered. */
#if HAS_FEAT_XS
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* PPL-owned I/O pages must not remain kernel-writable. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	/* Invalidate any stale TLB entries covering the updated VA range. */
	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
2038 
2039 /*
2040  *      Back-door routine for mapping kernel VM at initialization.
2041  *      Useful for mapping memory outside the range
2042  *      [vm_first_phys, vm_last_phys] (i.e., devices).
2043  *      Otherwise like pmap_map.
2044  */
2045 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)2046 pmap_map_bd(
2047 	vm_map_address_t virt,
2048 	vm_offset_t start,
2049 	vm_offset_t end,
2050 	vm_prot_t prot)
2051 {
2052 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
2053 }
2054 
2055 /*
2056  *      Back-door routine for mapping kernel VM at initialization.
2057  *      Useful for mapping memory specific physical addresses in early
2058  *      boot (i.e., before kernel_map is initialized).
2059  *
2060  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
2061  */
2062 
2063 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2064 pmap_map_high_window_bd(
2065 	vm_offset_t pa_start,
2066 	vm_size_t len,
2067 	vm_prot_t prot)
2068 {
2069 	pt_entry_t              *ptep, pte;
2070 	vm_map_address_t        va_start = VREGION1_START;
2071 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
2072 	vm_map_address_t        va_end;
2073 	vm_map_address_t        va;
2074 	vm_size_t               offset;
2075 
2076 	offset = pa_start & PAGE_MASK;
2077 	pa_start -= offset;
2078 	len += offset;
2079 
2080 	if (len > (va_max - va_start)) {
2081 		panic("%s: area too large, "
2082 		    "pa_start=%p, len=%p, prot=0x%x",
2083 		    __FUNCTION__,
2084 		    (void*)pa_start, (void*)len, prot);
2085 	}
2086 
2087 scan:
2088 	for (; va_start < va_max; va_start += PAGE_SIZE) {
2089 		ptep = pmap_pte(kernel_pmap, va_start);
2090 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2091 		if (!pte_is_valid(*ptep)) {
2092 			break;
2093 		}
2094 	}
2095 	if (va_start > va_max) {
2096 		panic("%s: insufficient pages, "
2097 		    "pa_start=%p, len=%p, prot=0x%x",
2098 		    __FUNCTION__,
2099 		    (void*)pa_start, (void*)len, prot);
2100 	}
2101 
2102 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2103 		ptep = pmap_pte(kernel_pmap, va_end);
2104 		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2105 		if (pte_is_valid(*ptep)) {
2106 			va_start = va_end + PAGE_SIZE;
2107 			goto scan;
2108 		}
2109 	}
2110 
2111 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2112 		ptep = pmap_pte(kernel_pmap, va);
2113 		pte = pa_to_pte(pa_start)
2114 		    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2115 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2116 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2117 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2118 #if __ARM_KERNEL_PROTECT__
2119 		pte |= ARM_PTE_NG;
2120 #endif /* __ARM_KERNEL_PROTECT__ */
2121 		write_pte_strong(ptep, pte);
2122 	}
2123 	PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2124 #if KASAN
2125 	kasan_notify_address(va_start, len);
2126 #endif
2127 	return va_start;
2128 }
2129 
/**
 * Determine the number of virtual ASIDs the system should support, based on
 * the optional "pmap-max-asids" property under /defaults in the device tree.
 *
 * @return The validated maximum virtual-ASID count; MAX_ASIDS if the device
 *         tree does not specify one.  Panics on malformed or out-of-range
 *         values.
 */
static uint32_t
pmap_compute_max_asids(void)
{
	DTEntry entry;
	void const *prop = NULL;
	uint32_t max_asids;
	int err;
	unsigned int prop_size;

	err = SecureDTLookupEntry(NULL, "/defaults", &entry);
	assert(err == kSuccess);

	if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
		/* TODO: consider allowing maxproc limits to be scaled earlier so that
		 * we can choose a more flexible default value here. */
		return MAX_ASIDS;
	}

	if (prop_size != sizeof(max_asids)) {
		panic("pmap-max-asids property is not a 32-bit integer");
	}

	max_asids = *((uint32_t const *)prop);
#if HAS_16BIT_ASID
	/* 16-bit-ASID targets allocate ASIDs 1:1, so the hardware limit applies directly. */
	if (max_asids > MAX_HW_ASIDS) {
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#else
	/* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
	max_asids = (max_asids + 63) & ~63UL;

	if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
		/* currently capped by size of pmap->sw_asid */
		panic("pmap-max-asids 0x%x too large", max_asids);
	}
#endif /* HAS_16BIT_ASID */
	if (max_asids == 0) {
		panic("pmap-max-asids cannot be zero");
	}
	return max_asids;
}
2171 
2172 #if __arm64__
2173 /*
2174  * pmap_get_arm64_prot
2175  *
2176  * return effective armv8 VMSA block protections including
2177  * table AP/PXN/XN overrides of a pmap entry
2178  *
2179  */
2180 
/**
 * Compute the effective ARMv8 VMSA protection bits for "addr" in "pmap" by
 * walking the translation tables and folding table-level AP/XN/PXN
 * overrides into the leaf entry's bits.
 *
 * @param pmap The pmap whose tables are walked.
 * @param addr The virtual address to query.
 *
 * @return The effective AP/NX/PNX bits in PTE layout, or 0 if the address
 *         is unmapped at any level.
 */
uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from the root level down to the leaf, accumulating table overrides. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Unmapped at this level: no effective protections. */
			return 0;
		}

		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_is_table(tte)) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* Extract the accumulated table-level override fields. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2225 #endif /* __arm64__ */
2226 
2227 /**
2228  * Helper macros for accessing the "unnested" and "in-progress" bits in
2229  * pmap->nested_region_unnested_table_bitmap.
2230  */
2231 #define UNNEST_BIT(index) ((index) * 2)
2232 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
2233 
2234 
2235 /*
2236  *	Bootstrap the system enough to run with virtual memory.
2237  *
2238  *	The early VM initialization code has already allocated
2239  *	the first CPU's translation table and made entries for
2240  *	all the one-to-one mappings to be found there.
2241  *
2242  *	We must set up the kernel pmap structures, the
2243  *	physical-to-virtual translation lookup tables for the
2244  *	physical memory to be managed (between avail_start and
2245  *	avail_end).
2246  *
2247  *	Map the kernel's code and data, and allocate the system page table.
2248  *	Page_size must already be set.
2249  *
2250  *	Parameters:
2251  *	first_avail	first available physical page -
2252  *			   after kernel page tables
2253  *	avail_start	PA of first managed physical page
2254  *	avail_end	PA of last managed physical page
2255  */
2256 
2257 void
pmap_bootstrap(vm_offset_t vstart)2258 pmap_bootstrap(
2259 	vm_offset_t vstart)
2260 {
2261 	vm_map_offset_t maxoffset;
2262 
2263 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2264 
2265 #if XNU_MONITOR
2266 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
2267 	pmap_ppl_disable = ml_unsafe_kernel_text();
2268 #endif
2269 
2270 #endif /* XNU_MONITOR */
2271 
2272 #if DEVELOPMENT || DEBUG
2273 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2274 		kprintf("Kernel traces for pmap operations enabled\n");
2275 	}
2276 #endif
2277 
2278 	/*
2279 	 *	Initialize the kernel pmap.
2280 	 */
2281 #if ARM_PARAMETERIZED_PMAP
2282 	kernel_pmap->pmap_pt_attr = native_pt_attr;
2283 #endif /* ARM_PARAMETERIZED_PMAP */
2284 #if HAS_APPLE_PAC
2285 	kernel_pmap->disable_jop = 0;
2286 #endif /* HAS_APPLE_PAC */
2287 	kernel_pmap->tte = cpu_tte;
2288 	kernel_pmap->ttep = cpu_ttep;
2289 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2290 	kernel_pmap->max = UINTPTR_MAX;
2291 	os_atomic_init(&kernel_pmap->ref_count, 1);
2292 #if XNU_MONITOR
2293 	os_atomic_init(&kernel_pmap->nested_count, 0);
2294 #endif
2295 	kernel_pmap->nx_enabled = TRUE;
2296 #ifdef  __arm64__
2297 	kernel_pmap->is_64bit = TRUE;
2298 #else
2299 	kernel_pmap->is_64bit = FALSE;
2300 #endif
2301 #if CONFIG_ROSETTA
2302 	kernel_pmap->is_rosetta = FALSE;
2303 #endif
2304 
2305 	kernel_pmap->nested_region_addr = 0x0ULL;
2306 	kernel_pmap->nested_region_size = 0x0ULL;
2307 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2308 	kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2309 	kernel_pmap->type = PMAP_TYPE_KERNEL;
2310 
2311 	kernel_pmap->hw_asid = 0;
2312 	kernel_pmap->sw_asid = 0;
2313 
2314 	pmap_lock_init(kernel_pmap);
2315 
2316 	pmap_max_asids = pmap_compute_max_asids();
2317 #if HAS_16BIT_ASID
2318 	asid_chunk_size = MAX_HW_ASIDS;
2319 #else
2320 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2321 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2322 	/* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2323 	 * masking used by the PLRU scheme.  This means we must handle the case in which
2324 	 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2325 	_Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2326 	_Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2327 	asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2328 #endif /* HAS_16BIT_ASIDS */
2329 
2330 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2331 
2332 #if HAS_SPECRES_DEBUGGING
2333 	PE_parse_boot_argn("specres_debug", &specres_debug, sizeof(specres_debug));
2334 
2335 	if ((specres_debug & SPECRES_DEBUG_FORCE_RCTX) && (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX)) {
2336 		panic("%s: invalid specres_debug value: %llu", __func__, specres_debug);
2337 	}
2338 #endif /* HAS_SPECRES_DEBUGGING */
2339 
2340 	/**
2341 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2342 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
2343 	 * space for these data structures.
2344 	 */
2345 	pmap_data_bootstrap();
2346 
2347 	/**
2348 	 * Bootstrap any necessary SART data structures and values needed from the device tree.
2349 	 */
2350 	sart_bootstrap();
2351 
2352 	/**
2353 	 * Don't make any assumptions about the alignment of avail_start before this
2354 	 * point (i.e., pmap_data_bootstrap() performs allocations).
2355 	 */
2356 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2357 
2358 	const pmap_paddr_t pmap_struct_start = avail_start;
2359 
2360 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
2361 	avail_start = round_page(avail_start + asid_table_size);
2362 
2363 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2364 
2365 	vm_first_phys = gPhysBase;
2366 	vm_last_phys = trunc_page(avail_end);
2367 
2368 	queue_init(&map_pmap_list);
2369 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2370 	free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2371 	free_page_size_tt_count = 0;
2372 	free_page_size_tt_max = 0;
2373 	free_tt_list = TT_FREE_ENTRY_NULL;
2374 	free_tt_count = 0;
2375 	free_tt_max = 0;
2376 
2377 	virtual_space_start = vstart;
2378 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2379 
2380 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
2381 #if !HAS_16BIT_ASID
2382 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2383 	// Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2384 	asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2385 #endif /* !HAS_16BIT_ASID */
2386 
2387 
2388 
2389 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2390 		maxoffset = trunc_page(maxoffset);
2391 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2392 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2393 			arm_pmap_max_offset_default = maxoffset;
2394 		}
2395 	}
2396 #if defined(__arm64__)
2397 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2398 		maxoffset = trunc_page(maxoffset);
2399 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2400 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2401 			arm64_pmap_max_offset_default = maxoffset;
2402 		}
2403 	}
2404 #endif
2405 
2406 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2407 
2408 
2409 #if PMAP_CS_PPL_MONITOR
2410 	/* Initialize the PPL trust cache read-write lock */
2411 	lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2412 	ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2413 #endif
2414 
2415 #if DEVELOPMENT || DEBUG
2416 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
2417 	    &vm_footprint_suspend_allowed,
2418 	    sizeof(vm_footprint_suspend_allowed));
2419 #endif /* DEVELOPMENT || DEBUG */
2420 
2421 #if KASAN
2422 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2423 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2424 #endif /* KASAN */
2425 
2426 	/**
2427 	 * Ensure that avail_start is always left on a page boundary. The calling
2428 	 * code might not perform any alignment before allocating page tables so
2429 	 * this is important.
2430 	 */
2431 	avail_start = round_page(avail_start);
2432 }
2433 
2434 #if XNU_MONITOR
2435 
2436 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2437 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2438 {
2439 	pmap_paddr_t cur_pa;
2440 	for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2441 		assert(pa_valid(cur_pa));
2442 		ppattr_pa_set_monitor(cur_pa);
2443 	}
2444 }
2445 
2446 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2447 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2448     pmap_paddr_t end_pa,
2449     unsigned int expected_perm,
2450     unsigned int new_perm)
2451 {
2452 	vm_offset_t start_va = phystokv(start_pa);
2453 	vm_offset_t end_va = start_va + (end_pa - start_pa);
2454 
2455 	pa_set_range_monitor(start_pa, end_pa);
2456 	pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2457 }
2458 
/**
 * Mark every physical page backing the kernelcache as "locked down" so its
 * mappings cannot subsequently be modified, skipping memory that will be
 * reclaimed by the OS.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() trims range_size to the contiguous PA->VA run. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h  = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		/* A page must never be locked down twice. */
		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages are deliberately modified by XNUPOST; exempt them. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h  = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2504 
/**
 * Finalize protections on memory that was statically carved out during boot,
 * before the PPL (Page Protection Layer) is locked down.
 *
 * This retypes the bootstrap page tables, remaining bootstrap allocations,
 * RO page tables, and the PPL text/data segments to their steady-state xPRR
 * permissions, and finally locks down the kernelcache physical pages so they
 * cannot be remapped.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at bootstrap; demote them to kernel-RO. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap; hand them to the PPL. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses.  We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR.  Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable.  This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* No permission change (RO -> RO), but this marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution.  The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2592 
2593 
/**
 * Transition the PPL into its locked-down state: lock down the commpage
 * mappings so they can no longer be remapped or retyped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* The text commpage is optional; lock it down executable when present. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): this #error is unconditional as visible here, which would
	 * fail compilation.  Presumably it sits on the fallback arm of an xPRR
	 * configuration #if whose guard was elided from this excerpt — confirm
	 * against the full file before editing this region.
	 */
#error "XPRR configuration error"
}
2610 #endif /* XNU_MONITOR */
2611 
2612 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2613 pmap_virtual_space(
2614 	vm_offset_t *startp,
2615 	vm_offset_t *endp
2616 	)
2617 {
2618 	*startp = virtual_space_start;
2619 	*endp = virtual_space_end;
2620 }
2621 
2622 
/**
 * Enumerate reserved kernel virtual regions that the VM layer must not use
 * for general allocation.
 *
 * @param region_select index of the region to query; the caller iterates from
 *        0 upward until this function returns FALSE.
 * @param startp [out] base of the selected region.
 * @param size [out] size of the selected region.
 *
 * @return TRUE if region_select names a valid region for this configuration,
 *         FALSE otherwise.  Which indices are valid depends on the
 *         KTRR/CTRR and ARM_LARGE_MEMORY configuration below.
 */
__mockable boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t       ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot.  Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K page geometry: 32MB (L2 block) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K page geometry: 8MB alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only reserve the low-globals range when region 0 did not already cover it. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
	return ret;
}
2716 
2717 /*
2718  * Routines to track and allocate physical pages during early boot.
2719  * On most systems that memory runs from first_avail through to avail_end
2720  * with no gaps.
2721  *
2722  * If the system supports ECC and ecc_bad_pages_count > 0, we
2723  * need to skip those pages.
2724  */
2725 
/* Count of boot-time physical pages still available via pmap_next_page(). */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() lazily computes avail_page_count. */
static bool need_ram_ranges_init = true;
2728 
2729 
2730 /**
2731  * Checks to see if a given page is in
2732  * the array of known bad pages
2733  *
2734  * @param ppn page number to check
2735  */
2736 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2737 pmap_is_bad_ram(__unused ppnum_t ppn)
2738 {
2739 	return false;
2740 }
2741 
2742 /**
2743  * Prepare bad ram pages to be skipped.
2744  */
2745 
2746 /*
2747  * Initialize the count of available pages. No lock needed here,
2748  * as this code is called while kernel boot up is single threaded.
2749  */
2750 static void
initialize_ram_ranges(void)2751 initialize_ram_ranges(void)
2752 {
2753 	pmap_paddr_t first = first_avail;
2754 	pmap_paddr_t end = avail_end;
2755 
2756 	assert(first <= end);
2757 	assert(first == (first & ~PAGE_MASK));
2758 	assert(end == (end & ~PAGE_MASK));
2759 	avail_page_count = atop(end - first);
2760 
2761 	need_ram_ranges_init = false;
2762 }
2763 
2764 unsigned int
pmap_free_pages(void)2765 pmap_free_pages(
2766 	void)
2767 {
2768 	if (need_ram_ranges_init) {
2769 		initialize_ram_ranges();
2770 	}
2771 	return avail_page_count;
2772 }
2773 
2774 unsigned int
pmap_free_pages_span(void)2775 pmap_free_pages_span(
2776 	void)
2777 {
2778 	if (need_ram_ranges_init) {
2779 		initialize_ram_ranges();
2780 	}
2781 	return (unsigned int)atop(avail_end - first_avail);
2782 }
2783 
2784 
2785 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2786 pmap_next_page_hi(
2787 	ppnum_t            * pnum,
2788 	__unused boolean_t might_free)
2789 {
2790 	return pmap_next_page(pnum);
2791 }
2792 
2793 
2794 boolean_t
pmap_next_page(ppnum_t * pnum)2795 pmap_next_page(
2796 	ppnum_t *pnum)
2797 {
2798 	if (need_ram_ranges_init) {
2799 		initialize_ram_ranges();
2800 	}
2801 
2802 
2803 	if (first_avail != avail_end) {
2804 		*pnum = (ppnum_t)atop(first_avail);
2805 		first_avail += PAGE_SIZE;
2806 		assert(avail_page_count > 0);
2807 		--avail_page_count;
2808 		return TRUE;
2809 	}
2810 	assert(avail_page_count == 0);
2811 	return FALSE;
2812 }
2813 
2814 
2815 /**
2816  * Helper function to check wheter the given physical
2817  * page number is a restricted page.
2818  *
2819  * @param pn the physical page number to query.
2820  */
2821 bool
pmap_is_page_restricted(__unused ppnum_t pn)2822 pmap_is_page_restricted(__unused ppnum_t pn)
2823 {
2824 	return false;
2825 }
2826 
2827 /*
2828  *	Initialize the pmap module.
2829  *	Called by vm_init, to initialize any structures that the pmap
2830  *	system needs to map virtual memory.
2831  */
void
pmap_init(
	void)
{
	/*
	 *	Protect page zero in the kernel map.
	 *	(can be overruled by permanent transltion
	 *	table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	/* Other pmap entry points may now assume full initialization. */
	pmap_initialized = TRUE;

	/*
	 *	Create the zone of physical maps
	 *	and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 *	Initialize the pmap object (for tracking the vm_page_t
	 *	structures for pages we allocate to be page tables in
	 *	pmap_expand().
	 */
	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 * (Each process needs its own ASID, so the process limits cannot
	 * exceed the number of ASIDs available.)
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2872 
2873 /**
2874  * Verify that a given physical page contains no mappings (outside of the
2875  * default physical aperture mapping).
2876  *
2877  * @param ppnum Physical page number to check there are no mappings to.
2878  *
2879  * @return True if there are no mappings, false otherwise or if the page is not
2880  *         kernel-managed.
2881  */
2882 bool
pmap_verify_free(ppnum_t ppnum)2883 pmap_verify_free(ppnum_t ppnum)
2884 {
2885 	const pmap_paddr_t pa = ptoa(ppnum);
2886 
2887 	assert(pa != vm_page_fictitious_addr);
2888 
2889 	/* Only mappings to kernel-managed physical memory are tracked. */
2890 	if (!pa_valid(pa)) {
2891 		return false;
2892 	}
2893 
2894 	const unsigned int pai = pa_index(pa);
2895 	pv_entry_t **pvh = pai_to_pvh(pai);
2896 
2897 	return pvh_test_type(pvh, PVH_TYPE_NULL);
2898 }
2899 
2900 #if MACH_ASSERT
2901 /**
2902  * Verify that a given physical page contains no mappings (outside of the
2903  * default physical aperture mapping) and if it does, then panic.
2904  *
2905  * @note It's recommended to use pmap_verify_free() directly when operating in
2906  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2907  *       normally being called from outside of the PPL, and the pv_head_table
2908  *       can't be modified outside of the PPL).
2909  *
2910  * @param ppnum Physical page number to check there are no mappings to.
2911  */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists: find it so the panic string can identify it. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head stores the PTE pointer directly. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		/* Multi-mapping case: walk the PV entry list. */
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2984 #endif /* MACH_ASSERT */
2985 
2986 inline void
pmap_recycle_page(ppnum_t pn)2987 pmap_recycle_page(ppnum_t pn)
2988 {
2989 	const bool is_freed = pmap_verify_free(pn);
2990 
2991 	if (__improbable(!is_freed)) {
2992 		/*
2993 		 * There is a redundancy here, but we are going to panic anyways,
2994 		 * and ASSERT_PMAP_FREE traces useful information. So, we keep this
2995 		 * behavior.
2996 		 */
2997 #if MACH_ASSERT
2998 		pmap_assert_free(pn);
2999 #endif /* MACH_ASSERT */
3000 		panic("%s: page 0x%llx is referenced", __func__, (unsigned long long)ptoa(pn));
3001 	}
3002 }
3003 
3004 
3005 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)3006 pmap_root_alloc_size(pmap_t pmap)
3007 {
3008 #pragma unused(pmap)
3009 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3010 	unsigned int root_level = pt_attr_root_level(pt_attr);
3011 	const uint64_t index = pt_attr_va_valid_mask(pt_attr);
3012 	return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
3013 }
3014 
3015 
3016 /*
3017  *	Create and return a physical map.
3018  *
3019  *	If the size specified for the map
3020  *	is zero, the map is an actual physical
3021  *	map, and may be referenced by the
3022  *	hardware.
3023  *
3024  *	If the size specified is non-zero,
3025  *	the map will be used in software only, and
3026  *	is bounded by that size.
3027  */
/**
 * Allocate and initialize a new user pmap.
 *
 * @param ledger ledger to attach to the new pmap (already referenced by the
 *        caller; see pmap_create_options()).
 * @param size must be 0; a non-zero size is only meaningful for stage-2
 *        pmaps, which this configuration does not support.
 * @param flags PMAP_CREATE_* options; unknown bits cause failure.
 * @param kr [out] detailed status on failure (pinned/unpinned around the
 *        write when running under the PPL).
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned        i;
	unsigned        tte_index_max;
	pmap_t          p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t   local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag bits this implementation does not understand. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
	 *	the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	/* Mark the new pmap as being used for Rosetta translation if requested. */
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	/* No nesting bounds established yet: cover the whole address range. */
	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	/* Must come after pmap_pt_attr is final: the VA limit depends on it. */
	p->max = pmap_user_va_size(p);

	/* Reserve a hardware ASID (or equivalent) for this address space. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	/* Inside the PPL we cannot block; the caller retries on shortage. */
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 *  initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the new pmap on the global list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative.  The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition (goto-based cleanup). */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at kernel memory; pin it so the PPL may safely write it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3218 
/**
 * Public entry point for creating a pmap.  Takes a ledger reference, then
 * delegates to the PPL (retrying on page shortage by donating pages) or to
 * pmap_create_options_internal() directly.
 *
 * @return the new pmap, or PMAP_NULL on failure (the ledger reference is
 *         dropped in that case).
 */
__mockable pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	/*
	 * The PPL cannot allocate pages for itself; on KERN_RESOURCE_SHORTAGE,
	 * donate a page to the PPL's free list and retry until another status
	 * is returned.
	 */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	/*
	 * NOTE(review): pmap->hw_asid is evaluated here even on the failure path
	 * where pmap == PMAP_NULL — presumably PMAP_TRACE compiles out or its
	 * arguments are guarded in that configuration; confirm before relying on
	 * the failure path with tracing enabled.
	 */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3254 
3255 #if XNU_MONITOR
3256 /*
3257  * This symbol remains in place when the PPL is enabled so that the dispatch
3258  * table does not change from development to release configurations.
3259  */
3260 #endif
3261 #if MACH_ASSERT || XNU_MONITOR
3262 MARK_AS_PMAP_TEXT void
pmap_set_process_internal(__unused pmap_t pmap,__unused int pid,__unused char * procname)3263 pmap_set_process_internal(
3264 	__unused pmap_t pmap,
3265 	__unused int pid,
3266 	__unused char *procname)
3267 {
3268 #if MACH_ASSERT
3269 	if (pmap == NULL || pmap->pmap_pid == -1) {
3270 		return;
3271 	}
3272 
3273 	validate_pmap_mutable(pmap);
3274 
3275 	pmap->pmap_pid = pid;
3276 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
3277 #endif /* MACH_ASSERT */
3278 }
3279 #endif /* MACH_ASSERT || XNU_MONITOR */
3280 
3281 #if MACH_ASSERT
/*
 * Kernel-side wrapper for recording a process's pid/name on its pmap;
 * dispatches into the PPL when the monitor is enabled.
 */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3294 #endif /* MACH_ASSERT */
3295 
3296 /*
3297  * pmap_deallocate_all_leaf_tts:
3298  *
3299  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
3300  * removing and deallocating all TTEs.
3301  */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Callers start at the root; recursion stops before the leaf level. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry in a table at this level (VA = all ones). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* User pmaps are expected to contain only table entries here. */
		if (tte_is_block(tte)) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/*
		 * Remove the TTE.  The lock is taken per iteration; presumably
		 * pmap_tte_deallocate() drops it — confirm against its definition.
		 */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3340 
3341 /*
3342  * We maintain stats and ledgers so that a task's physical footprint is:
3343  * phys_footprint = ((internal - alternate_accounting)
3344  *                   + (internal_compressed - alternate_accounting_compressed)
3345  *                   + iokit_mapped
3346  *                   + purgeable_nonvolatile
3347  *                   + purgeable_nonvolatile_compressed
3348  *                   + page_table)
3349  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3350  */
3351 
3352 /*
3353  *	Retire the given physical map from service.
3354  *	Should only be called if the map contains
3355  *	no valid mappings.
3356  */
/**
 * Drop a reference on a pmap, and when the last reference goes away, tear it
 * down: unmap the commpage, free all translation tables, flush the TLB,
 * release the ASID, and free the pmap structure.  Panics if the pmap is the
 * kernel pmap, a commpage pmap, or still in use on some CPU (PPL builds).
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Release semantics: prior stores to the pmap are visible before the drop. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, release);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);

#if XNU_MONITOR
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Make sure no CPU is currently running on, or switching to, this pmap. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Remove the pmap from the global list before tearing it down. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 *	Free the memory maps, then the
	 *	pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps have no ASID of their own to free. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3470 
/*
 * Public entry point for dropping a pmap reference.  The ledger pointer is
 * captured before the destroy call, since the pmap structure may be freed by
 * it; the caller-visible ledger reference is dropped afterwards.
 */
__mockable void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3491 
3492 
3493 /*
3494  *	Add a reference to the specified pmap.
3495  */
3496 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3497 pmap_reference_internal(
3498 	pmap_t pmap)
3499 {
3500 	if (pmap != PMAP_NULL) {
3501 		validate_pmap_mutable(pmap);
3502 		os_atomic_inc(&pmap->ref_count, acquire);
3503 	}
3504 }
3505 
void
pmap_reference(
	pmap_t pmap)
{
	/*
	 * On PPL-enabled systems the refcount lives in PPL-protected memory and
	 * must be updated by the monitor; otherwise call the internal
	 * implementation directly.
	 */
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3516 
/**
 * Allocate a top-level (TT1/root) translation table.
 *
 * Allocations are first served from the global free lists; otherwise a fresh
 * zeroed physical page is allocated, and for sub-page table sizes the unused
 * chunks of that page are donated to the global sub-page free list.
 *
 * @param pmap   pmap the allocation is accounted against (ledgers/stats only).
 * @param size   requested size in bytes; any sub-page size other than
 *               PMAP_ROOT_ALLOC_SIZE is rounded up to PAGE_SIZE.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block when no
 *               physical pages are available.
 *
 * @return KVA of the table, or NULL on resource shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t          pmap,
	vm_size_t       size,
	unsigned        option)
{
	tt_entry_t      *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t    pa;
	vm_address_t    va;
	vm_address_t    va_end;
	kern_return_t   ret;

	/* Only PMAP_ROOT_ALLOC_SIZE is a supported sub-page allocation size. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/**
	 * We expect top level translation tables to always fit into a single
	 * physical page. This would also catch a misconfiguration if 4K
	 * concatenated page tables needed more than one physical tt1 page.
	 */
	if (__improbable(size > PAGE_SIZE)) {
		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)size);
	}

	/* Fast path: pop a table off the matching global free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}
	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists were empty: allocate (and zero) a fresh physical page. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	/*
	 * For a sub-page allocation the caller keeps the first chunk of the new
	 * page; chain the remaining (PAGE_SIZE / size) - 1 chunks into the
	 * global sub-page free list. The chain is built lock-free and spliced
	 * in under tt1_lock in one step.
	 */
	if (size < PAGE_SIZE) {
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3597 
/**
 * Return a top-level (TT1/root) translation table to the global free lists,
 * trimming the page-sized free list back to FREE_PAGE_SIZE_TT_MAX unless the
 * caller asked not to block.
 *
 * @param pmap   pmap the deallocation is accounted against.
 * @param tt     table being freed.
 * @param size   size in bytes; normalized the same way as pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trim loop, which frees
 *               physical pages and may block.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Normalize size exactly as the allocation path does. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Sub-page tables go onto the sub-page free list... */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	/* ...full pages onto the page-sized free list. */
	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/*
	 * Trim the page-sized free list down to its cap. Each page is unlinked
	 * and its count decremented while tt1_lock is held, then the lock is
	 * dropped around pmap_pages_free() (which may block) and reacquired
	 * before re-evaluating the loop condition.
	 */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3656 
/**
 * Allocate a non-root page table page for `pmap`.
 *
 * First tries the pmap's private tt_entry_free list; failing that, allocates
 * a fresh zeroed physical page plus a page table descriptor (PTD) for it.
 * When the pmap's native page size is smaller than the kernel PAGE_SIZE, the
 * caller keeps the first chunk and the remaining chunks of the new page are
 * added to the pmap's free list.
 *
 * @param pmap    pmap the table will belong to.
 * @param ttp     out parameter: KVA of the allocated table on success.
 * @param level   page table level the new table will serve (used for stats).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to avoid
 *                blocking for pages or descriptors.
 *
 * @return KERN_SUCCESS, KERN_ABORTED if the pmap lock could not be acquired
 *         without waiting (preemption pending), or KERN_RESOURCE_SHORTAGE.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t       *ptdp;

		const unsigned int alloc_flags =
		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGES_ALLOCATE_NOWAIT : 0;
		/*
		 *  Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, alloc_flags) != KERN_SUCCESS) {
			/*
			 * NOTE(review): alloc_flags is derived from
			 * PMAP_TT_ALLOCATE_NOWAIT while the bail-out below tests
			 * PMAP_OPTIONS_NOWAIT -- presumably callers pass both
			 * together; confirm before relying on either alone.
			 */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Update table-page vs leaf-page statistics based on the target level. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t    va;
			vm_address_t    va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Donate every chunk after the first to the pmap's free list. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3757 
3758 
/**
 * Return a non-root page table to its pmap's free list, and free the backing
 * physical page once every pmap-page-sized table within it is unused.
 *
 * @param pmap  pmap that owns the table.
 * @param ttp   KVA of the table being deallocated; its refcount must already
 *              be zero (or the sentinel PT_DESC_REFCOUNT for non-leaf tables).
 * @param level page table level of the table being freed.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t     free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-native pages sharing one kernel physical page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as invalid. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; normalize it to zero. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum refcounts across all pmap-native tables sharing this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * Count how many of this physical page's chunks are already on
		 * the pmap's free list (plus the one being freed now).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/*
			 * The whole physical page is now free: unlink every chunk
			 * of it from the free list so the page can be released.
			 */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables on the page are still live: just park this chunk. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Release the descriptor, clear the PV head, and free the page. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3847 
3848 /**
3849  * Safely clear out a translation table entry.
3850  *
3851  * @note If the TTE to clear out points to a leaf table, then that leaf table
3852  *       must have a refcnt of zero before the TTE can be removed.
3853  * @note This function expects to be called with pmap locked exclusive, and will
3854  *       return with pmap unlocked.
3855  *
3856  * @param pmap The pmap containing the page table whose TTE is being removed.
3857  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3858  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3859  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3860  * @param ttep Pointer to the TTE that should be cleared out.
3861  * @param level The level of the page table that contains the TTE to be removed.
3862  */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot the TTE; it is cleared below but still needed to locate the table. */
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and force the write out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	/* Per the contract, the pmap lock is dropped here and not reacquired. */
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			/* Re-poll only CPUs still marked as having an in-flight disconnect. */
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/* Exclusive monitor armed: wait for the remote store to wake us. */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable(pte_is_valid(*ptep))) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
4037 
4038 /**
4039  * Given a pointer to an entry within a `level` page table, delete the
4040  * page table at `level` + 1 that is represented by that entry. For instance,
4041  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
4042  * contains the PA of the L3 table, and `level` would be "2".
4043  *
4044  * @note If the table getting deallocated is a leaf table, then that leaf table
4045  *       must have a refcnt of zero before getting deallocated. All other levels
4046  *       must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
4047  * @note This function expects to be called with pmap locked exclusive and will
4048  *       return with pmap unlocked.
4049  *
4050  * @param pmap The pmap that owns the page table to be deallocated.
4051  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
4052  * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
4053  * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
4054  * @param ttep Pointer to the `level` TTE to remove.
4055  * @param level The level of the table that contains an entry pointing to the
4056  *              table to be removed. The deallocated page table will be a
4057  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
4058  *              deleted).
4059  */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Snapshot the TTE now: pmap_tte_remove() clears *ttep, and we still
	 * need the old value to find the table's physical address below. */
	tte = *ttep;

	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
	    (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* With the TTE cleared and TLBs flushed, the `level`+1 table can be freed. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
4088 
4089 /*
4090  *	Remove a range of hardware page-table entries.
4091  *	The entries given are the first (inclusive)
4092  *	and last (exclusive) entries for the VM pages.
4093  *	The virtual address is the va for the first pte.
4094  *
4095  *	The pmap must be locked.
4096  *	If the pmap is not the kernel pmap, the range must lie
4097  *	entirely within one pte-page.  This is NOT checked.
4098  *	Assumes that the pte-page exists.
4099  *
 *	Returns the number of PTEs changed.
4101  */
4102 MARK_AS_PMAP_TEXT static int
4103 pmap_remove_range(
4104 	pmap_t pmap,
4105 	vm_map_address_t va,
4106 	pt_entry_t *bpte,
4107 	pt_entry_t *epte)
4108 {
4109 	bool need_strong_sync = false;
4110 	int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4111 	    &need_strong_sync, PMAP_OPTIONS_REMOVE);
4112 	if (num_changed > 0) {
4113 		PMAP_UPDATE_TLBS(pmap, va,
4114 		    va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4115 	}
4116 	return num_changed;
4117 }
4118 
4119 
4120 #ifdef PVH_FLAG_EXEC
4121 
4122 /*
4123  *	Update the access protection bits of the physical aperture mapping for a page.
 *	This is useful, for example, in guaranteeing that a verified executable page
4125  *	has no writable mappings anywhere in the system, including the physical
4126  *	aperture.  flush_tlb_async can be set to true to avoid unnecessary TLB
4127  *	synchronization overhead in cases where the call to this function is
4128  *	guaranteed to be followed by other TLB operations.
4129  */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the kernel PTE backing this page's physical-aperture mapping. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/*
	 * NOTE(review): a hint (contiguous) mapping spans multiple PTEs, so a
	 * single-page permission change here would be inconsistent -- hence
	 * the panic rather than an attempt to repair it.
	 */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Invalidate the stale translation; sync now unless the caller batches. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4154 #endif /* defined(PVH_FLAG_EXEC) */
4155 
4156 
4157 
4158 MARK_AS_PMAP_TEXT int
4159 pmap_remove_range_options(
4160 	pmap_t pmap,
4161 	vm_map_address_t va,
4162 	pt_entry_t *bpte,
4163 	pt_entry_t *epte,
4164 	vm_map_address_t *eva,
4165 	bool *need_strong_sync __unused,
4166 	int options)
4167 {
4168 	pt_entry_t     *cpte;
4169 	size_t          npages = 0;
4170 	int             num_removed, num_unwired;
4171 	int             num_pte_changed;
4172 	unsigned int    pai = 0;
4173 	pmap_paddr_t    pa;
4174 	int             num_external, num_internal, num_reusable;
4175 	int             num_alt_internal;
4176 	uint64_t        num_compressed, num_alt_compressed;
4177 	int16_t         refcnt = 0;
4178 
4179 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4180 
4181 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4182 	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4183 
4184 	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4185 		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4186 	}
4187 
4188 	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4189 		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4190 	}
4191 
4192 	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
4193 		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
4194 	}
4195 
4196 	num_removed = 0;
4197 	num_unwired = 0;
4198 	num_pte_changed = 0;
4199 	num_external = 0;
4200 	num_internal = 0;
4201 	num_reusable = 0;
4202 	num_compressed = 0;
4203 	num_alt_internal = 0;
4204 	num_alt_compressed = 0;
4205 
4206 #if XNU_MONITOR
4207 	bool ro_va = false;
4208 	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4209 		ro_va = true;
4210 	}
4211 #endif
4212 	for (cpte = bpte; cpte < epte;
4213 	    cpte += PAGE_RATIO, va += pmap_page_size) {
4214 		pt_entry_t      spte;
4215 		boolean_t       managed = FALSE;
4216 
4217 		/*
4218 		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4219 		 * so we need to be as aggressive as possible in checking for preemption when we can.
4220 		 */
4221 		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4222 			*eva = va;
4223 			break;
4224 		}
4225 
4226 		spte = *((volatile pt_entry_t*)cpte);
4227 
4228 		while (!managed) {
4229 			if (pmap != kernel_pmap &&
4230 			    (options & PMAP_OPTIONS_REMOVE) &&
4231 			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4232 				/*
4233 				 * "pmap" must be locked at this point,
4234 				 * so this should not race with another
4235 				 * pmap_remove_range() or pmap_enter().
4236 				 */
4237 
4238 				/* one less "compressed"... */
4239 				num_compressed++;
4240 				if (spte & ARM_PTE_COMPRESSED_ALT) {
4241 					/* ... but it used to be "ALTACCT" */
4242 					num_alt_compressed++;
4243 				}
4244 
4245 				/* clear marker */
4246 				write_pte_fast(cpte, ARM_PTE_EMPTY);
4247 				/*
4248 				 * "refcnt" also accounts for
4249 				 * our "compressed" markers,
4250 				 * so let's update it here.
4251 				 */
4252 				--refcnt;
4253 				spte = *((volatile pt_entry_t*)cpte);
4254 			}
4255 			/*
4256 			 * It may be possible for the pte to transition from managed
4257 			 * to unmanaged in this timeframe; for now, elide the assert.
4258 			 * We should break out as a consequence of checking pa_valid.
4259 			 */
4260 			//assert(!ARM_PTE_IS_COMPRESSED(spte));
4261 			pa = pte_to_pa(spte);
4262 			if (!pa_valid(pa)) {
4263 #if XNU_MONITOR
4264 				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4265 #endif
4266 #if XNU_MONITOR
4267 				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4268 				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4269 					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4270 					    __func__, (uint64_t)pa);
4271 				}
4272 #endif
4273 				break;
4274 			}
4275 #if HAS_FEAT_XS
4276 			if (pte_is_xs(pt_attr, spte)) {
4277 				*need_strong_sync = true;
4278 			}
4279 #endif /* HAS_FEAT_XS */
4280 			pai = pa_index(pa);
4281 			pvh_lock(pai);
4282 			spte = *((volatile pt_entry_t*)cpte);
4283 			pa = pte_to_pa(spte);
4284 			if (pai == pa_index(pa)) {
4285 				managed = TRUE;
4286 				break; // Leave pai locked as we will unlock it after we free the PV entry
4287 			}
4288 			pvh_unlock(pai);
4289 		}
4290 
4291 		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4292 			/*
4293 			 * There used to be a valid mapping here but it
4294 			 * has already been removed when the page was
4295 			 * sent to the VM compressor, so nothing left to
4296 			 * remove now...
4297 			 */
4298 			continue;
4299 		}
4300 
4301 		/* remove the translation, do not flush the TLB */
4302 		if (*cpte != ARM_PTE_EMPTY) {
4303 			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4304 			assertf(pte_is_valid(*cpte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4305 #if MACH_ASSERT
4306 			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4307 				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4308 				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4309 			}
4310 #endif
4311 			write_pte_fast(cpte, ARM_PTE_EMPTY);
4312 			num_pte_changed++;
4313 		}
4314 
4315 		if ((spte != ARM_PTE_EMPTY) && (pmap != kernel_pmap)) {
4316 			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4317 			assertf(pte_is_valid(spte), "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4318 			--refcnt;
4319 		}
4320 
4321 		if (pte_is_wired(spte)) {
4322 			pte_set_wired(pmap, cpte, 0);
4323 			num_unwired++;
4324 		}
4325 		/*
4326 		 * if not managed, we're done
4327 		 */
4328 		if (!managed) {
4329 			continue;
4330 		}
4331 
4332 #if XNU_MONITOR
4333 		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
4334 			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
4335 		}
4336 		if (__improbable(ro_va)) {
4337 			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4338 		}
4339 #endif
4340 
4341 		/*
4342 		 * find and remove the mapping from the chain for this
4343 		 * physical address.
4344 		 */
4345 		bool is_internal, is_altacct;
4346 		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4347 
4348 		if (is_altacct) {
4349 			assert(is_internal);
4350 			num_internal++;
4351 			num_alt_internal++;
4352 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4353 				ppattr_clear_altacct(pai);
4354 				ppattr_clear_internal(pai);
4355 			}
4356 		} else if (is_internal) {
4357 			if (ppattr_test_reusable(pai)) {
4358 				num_reusable++;
4359 			} else {
4360 				num_internal++;
4361 			}
4362 			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4363 				ppattr_clear_internal(pai);
4364 			}
4365 		} else {
4366 			num_external++;
4367 		}
4368 		pvh_unlock(pai);
4369 		num_removed++;
4370 	}
4371 
4372 	/*
4373 	 *	Update the counts
4374 	 */
4375 	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4376 
4377 	if (pmap != kernel_pmap) {
4378 		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4379 			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4380 		}
4381 
4382 		/* update ledgers */
4383 		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4384 		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4385 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4386 		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4387 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4388 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4389 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4390 		/* make needed adjustments to phys_footprint */
4391 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4392 		    ((num_internal -
4393 		    num_alt_internal) +
4394 		    (num_compressed -
4395 		    num_alt_compressed)) * pmap_page_size);
4396 	}
4397 
4398 	/* flush the ptable entries we have written */
4399 	if (num_pte_changed > 0) {
4400 		FLUSH_PTE_STRONG();
4401 	}
4402 
4403 	return num_pte_changed;
4404 }
4405 
4406 
4407 /*
4408  *	Remove the given range of addresses
4409  *	from the specified map.
4410  *
4411  *	It is assumed that the start and end are properly
4412  *	rounded to the hardware page size.
4413  */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/*
	 * Plain pmap_remove() always fully removes the mappings (as opposed
	 * to merely disconnecting them), so force PMAP_OPTIONS_REMOVE.
	 */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4422 
/*
 * Internal body of pmap_remove_options() for a subrange [start, end) which
 * the caller guarantees does not cross a twig-level (L2) table boundary,
 * so at most one TTE is involved.
 *
 * Returns the VA up to which mappings were actually processed; the caller
 * resumes from that address if it is short of 'end' (e.g. when
 * pmap_remove_range_options() stopped early).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;
	int             remove_count = 0;
	bool            need_strong_sync = false;
	bool            unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be leaf-page aligned and properly ordered. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if (tte_is_valid_table(*tte_p)) {
		/* Compute the bounding leaf-level PTE pointers for [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/*
		 * If this was the last reference to a user pagetable page, tear
		 * down the table itself.
		 */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/* Flush only if PTEs actually changed and nothing else flushed already. */
	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4489 
/*
 * Remove the range [start, end) from the given pmap, processing the range
 * in batches that never cross a twig-level (L2) table boundary so each
 * internal call touches at most one TTE.
 */
__mockable void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 *      Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

#if XNU_TARGET_OS_XR
		/* rdar://84856940 */
		/* Use a smaller fixed batch (128 leaf pages), still clamped to the L2 boundary. */
		unsigned int const BATCH_SIZE = 128 * pt_attr_leaf_size(pt_attr);

		l = va + BATCH_SIZE;

		vm_map_address_t const l_twig = l & ~pt_attr_twig_offmask(pt_attr);

		if (l_twig > (va & ~pt_attr_twig_offmask(pt_attr))) {
			// We're not allowed to cross an L2 boundary.
			l = l_twig;
		}
#else /* XNU_TARGET_OS_XR */
		/* Advance to the next L2 boundary (one twig-table's worth of VA at most). */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
#endif /* XNU_TARGET_OS_XR */
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		/* PPL systems: perform the removal inside the monitor, then audit ledgers. */
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4556 
4557 
4558 /*
4559  *	Remove phys addr if mapped in specified map
4560  */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Deliberate no-op: implement to support working set code */
}
4568 
4569 /*
4570  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4571  * switch a thread onto a new vm_map.
4572  */
4573 void
4574 pmap_switch_user(thread_t thread, vm_map_t new_map)
4575 {
4576 	pmap_t new_pmap = new_map->pmap;
4577 
4578 
4579 	thread->map = new_map;
4580 	pmap_set_pmap(new_pmap, thread);
4581 
4582 }
4583 
/*
 * Activate 'pmap' as the current user address space for 'thread'.
 */
void
pmap_set_pmap(
	pmap_t pmap,
	thread_t        thread)
{
	pmap_switch(pmap, thread);
#if __ARM_USER_PROTECT__
	/*
	 * Cache the user TTB value (with setup bits) and hardware ASID on the
	 * thread's machine state.
	 */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4595 
4596 static void
4597 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4598 {
4599 	flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4600 }
4601 
#if HAS_SPECRES
/*
 * Issue a CFP RCTX operation scoped to EL0 and the pmap's hardware ASID,
 * restricting speculative use of stale prediction state for that context.
 * Async: no completion barrier is issued here; the caller is responsible
 * for synchronization.
 */
static void
pmap_flush_core_cfp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("cfp rctx, %0" : : "r"(rctx_operand));
}

#if REQUIRES_DVP_RCTX
/*
 * Companion DVP RCTX operation for the same EL0/ASID scope, on hardware
 * that requires it in addition to CFP.  Async, like the CFP variant.
 */
static void
pmap_flush_core_dvp_asid_async(pmap_t pmap)
{
	const uint64_t rctx_operand = RCTX_EL(0ULL) | RCTX_ASID((uint64_t) pmap->hw_asid);
	asm volatile ("dvp rctx, %0" : : "r"(rctx_operand));
}
#endif /* REQUIRES_DVP_RCTX */
#endif /* HAS_SPECRES */
4619 
4620 static inline bool
4621 pmap_user_ttb_is_clear(void)
4622 {
4623 	return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4624 }
4625 
/*
 * Internal body of pmap_switch(): make 'pmap' the active user address space
 * on the current CPU.  Performs any ASID, shared-region, commpage, and
 * speculation-restriction maintenance needed so the incoming translations
 * cannot alias or conflict with stale TLB/prediction state, then installs
 * the new user TTB.
 */
MARK_AS_PMAP_TEXT void
pmap_switch_internal(
	pmap_t pmap)
{
	pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
#if XNU_MONITOR
	os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);

	/**
	 * Make sure a pmap is never active-and-nested. For more details,
	 * see pmap_set_nested_internal().
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
		panic("%s: attempt to activate nested pmap %p", __func__, pmap);
	}
#endif
	validate_pmap_mutable(pmap);
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint16_t asid_index = pmap->hw_asid;
	bool do_asid_flush = false;
	bool do_commpage_flush = false;
#if HAS_SPECRES
	bool do_speculation_restriction = false;
#endif /* HAS_SPECRES */

	/* Only the kernel pmap may legitimately carry hardware ASID 0. */
	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
	}
#if __ARM_KERNEL_PROTECT__
	asid_index >>= 1;
#endif

	/* Snapshot the nested (shared-region) pmap state last active on this CPU. */
	pmap_t                    last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
	__unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
	__unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
	__unused vm_map_offset_t  last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
	bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
	bool break_before_make = do_shared_region_flush;

#if !HAS_16BIT_ASID
	if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
		asid_index -= 1;
		pmap_update_plru(asid_index);

		/* Paranoia. */
		assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));

		/* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
		uint8_t new_sw_asid = pmap->sw_asid;
		uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];

		if (new_sw_asid != last_sw_asid) {
			/**
			 * If the virtual ASID of the new pmap does not match the virtual ASID
			 * last seen on this CPU for the physical ASID (that was a mouthful),
			 * then this switch runs the risk of aliasing.  We need to flush the
			 * TLB for this physical ASID in this case.
			 */
			cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
			do_asid_flush = true;
#if HAS_SPECRES
			do_speculation_restriction = true;
#endif /* HAS_SPECRES */
			break_before_make = true;
		}
	}
#endif /* !HAS_16BIT_ASID */

#if HAS_SPECRES_DEBUGGING
	/* Debug overrides: force speculation restriction on or off unconditionally. */
	if (specres_debug & SPECRES_DEBUG_FORCE_RCTX) {
		do_speculation_restriction = true;
	} else if (specres_debug & SPECRES_DEBUG_FORCE_NO_RCTX) {
		do_speculation_restriction = false;
	}
#endif /* HAS_SPECRES_DEBUGGING */

#if __ARM_MIXED_PAGE_SIZE__
	/* Changing TCR (e.g. page-size configuration) also requires break-before-make. */
	if (pt_attr->pta_tcr_value != get_tcr()) {
		break_before_make = true;
	}
#endif
#if __ARM_MIXED_PAGE_SIZE__
	/*
	 * For mixed page size configurations, we need to flush the global commpage mappings from
	 * the TLB when transitioning between address spaces with different page sizes.  Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry
	 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
	 * conflict abort or other unpredictable behavior.
	 */
	if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
		do_commpage_flush = true;
	}
	if (do_commpage_flush) {
		break_before_make = true;
	}
#endif
	/*
	 * When break-before-make is required, park TTBR0 on the invalid page
	 * table before performing any of the flushes below.
	 */
	if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
		PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
		pmap_clear_user_ttb_internal();
	}

#if HAS_SPECRES
	/**
	 * Perform an CFP/DVP flush if required.
	 */
	if (__improbable(do_speculation_restriction)) {
		pmap_flush_core_cfp_asid_async(pmap);
#if REQUIRES_DVP_RCTX
		pmap_flush_core_dvp_asid_async(pmap);
#endif /* REQUIRES_DVP_RCTX */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}
#endif /* HAS_SPECRES */

	/* If we're switching to a different nested pmap (i.e. shared region), we'll need
	 * to flush the userspace mappings for that region.  Those mappings are global
	 * and will not be protected by the ASID.  It should also be cheaper to flush the
	 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
	if (__improbable(do_shared_region_flush)) {
#if __ARM_RANGE_TLBI__
		uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
		vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;

		/* NOTE: here we flush the global TLB entries for the previous nested region only.
		 * There may still be non-global entries that overlap with the incoming pmap's
		 * nested region.  On Apple SoCs at least, this is acceptable.  Those non-global entries
		 * must necessarily belong to a different ASID than the incoming pmap, or they would
		 * be flushed in the do_asid_flush case below.  This will prevent them from conflicting
		 * with the incoming pmap's nested region.  However, the ARMv8 ARM is not crystal clear
		 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
		 * to consider additional invalidation here in the future. */
		if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
			flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
		} else {
			/*
			 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
			 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
			 * have a single-page shared region anyway, not least because pmap_nest()
			 * requires L2 block alignment of the address and size.
			 */
			do_asid_flush = false;
			flush_core_tlb_async();
		}
#else
		/* No range-TLBI support: a full local flush subsumes the ASID flush. */
		do_asid_flush = false;
		flush_core_tlb_async();
#endif // __ARM_RANGE_TLBI__
	}

#if __ARM_MIXED_PAGE_SIZE__
	/* Flush the global commpage region mappings (see comment above). */
	if (__improbable(do_commpage_flush)) {
		const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
		const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
		    0, _COMM_PAGE64_NESTING_START, commpage_shift);
		flush_core_tlb_allrange_async(rtlbi_param);
	}
#endif
	if (__improbable(do_asid_flush)) {
		pmap_flush_core_tlb_asid_async(pmap);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&pmap_asid_flushes, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

	/* Synchronize all the async invalidations queued above before switching TTB. */
	if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush
#if HAS_SPECRES && !HAS_ERRATA_123855614
	    || do_speculation_restriction
#endif /* HAS_SPECRES && !HAS_ERRATA_123855614 */
	    )) {
		sync_tlb_flush_local();
	}

	pmap_switch_user_ttb(pmap, cpu_data_ptr);
}
4803 
/*
 * Switch the current CPU onto 'pmap', dispatching into the PPL on
 * monitor-enabled configurations.  The thread argument is currently unused.
 */
void
pmap_switch(
	pmap_t pmap,
	thread_t thread __unused)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4817 
/*
 * Lower the permissions of all mappings of the given physical page;
 * thin wrapper over pmap_page_protect_options() with no options.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4825 
4826 /*
4827  *	Routine:	pmap_page_protect_options
4828  *
4829  *	Function:
4830  *		Lower the permission for all mappings to a given
4831  *		page.
4832  */
4833 MARK_AS_PMAP_TEXT static void
4834 pmap_page_protect_options_with_flush_range(
4835 	ppnum_t ppnum,
4836 	vm_prot_t prot,
4837 	unsigned int options,
4838 	pmap_tlb_flush_range_t *flush_range)
4839 {
4840 	pmap_paddr_t    phys = ptoa(ppnum);
4841 	pv_entry_t    **pv_h;
4842 	pv_entry_t     *pve_p, *orig_pve_p;
4843 	pv_entry_t     *pveh_p;
4844 	pv_entry_t     *pvet_p;
4845 	pt_entry_t     *pte_p, *orig_pte_p;
4846 	pv_entry_t     *new_pve_p;
4847 	pt_entry_t     *new_pte_p;
4848 	vm_offset_t     pvh_flags;
4849 	unsigned int    pai;
4850 	bool            remove;
4851 	bool            set_NX;
4852 	unsigned int    pvh_cnt = 0;
4853 	unsigned int    pass1_updated = 0;
4854 	unsigned int    pass2_updated = 0;
4855 
4856 	assert(ppnum != vm_page_fictitious_addr);
4857 
4858 	/* Only work with managed pages. */
4859 	if (!pa_valid(phys)) {
4860 		return;
4861 	}
4862 
4863 	/*
4864 	 * Determine the new protection.
4865 	 */
4866 	switch (prot) {
4867 	case VM_PROT_ALL:
4868 		return;         /* nothing to do */
4869 	case VM_PROT_READ:
4870 	case VM_PROT_READ | VM_PROT_EXECUTE:
4871 		remove = false;
4872 		break;
4873 	default:
4874 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4875 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4876 		remove = true;
4877 		break;
4878 	}
4879 
4880 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4881 	if (remove) {
4882 #if !XNU_MONITOR
4883 		mp_disable_preemption();
4884 #endif
4885 		pmap_cpu_data = pmap_get_cpu_data();
4886 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4887 		/*
4888 		 * Ensure the store to inflight_disconnect will be observed before any of the
4889 		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4890 		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4891 		 * another CPU, in between this function's clearing a PTE and dropping the
4892 		 * corresponding pagetable refcount.  That can lead to a panic if the
4893 		 * destroying thread observes a non-zero refcount.  For this we need a store-
4894 		 * store barrier; a store-release operation would not be sufficient.
4895 		 */
4896 		os_atomic_thread_fence(release);
4897 	}
4898 
4899 	pai = pa_index(phys);
4900 	pvh_lock(pai);
4901 	pv_h = pai_to_pvh(pai);
4902 	pvh_flags = pvh_get_flags(pv_h);
4903 
4904 #if XNU_MONITOR
4905 	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4906 		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4907 	}
4908 	if (__improbable(ppattr_pa_test_monitor(phys))) {
4909 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4910 	}
4911 	if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4912 		panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4913 	}
4914 #endif
4915 
4916 
4917 	orig_pte_p = pte_p = PT_ENTRY_NULL;
4918 	orig_pve_p = pve_p = PV_ENTRY_NULL;
4919 	pveh_p = PV_ENTRY_NULL;
4920 	pvet_p = PV_ENTRY_NULL;
4921 	new_pve_p = PV_ENTRY_NULL;
4922 	new_pte_p = PT_ENTRY_NULL;
4923 
4924 
4925 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4926 		orig_pte_p = pte_p = pvh_ptep(pv_h);
4927 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4928 		orig_pve_p = pve_p = pvh_pve_list(pv_h);
4929 		pveh_p = pve_p;
4930 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4931 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4932 	}
4933 
4934 	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
4935 	int pve_ptep_idx = 0;
4936 
4937 	/*
4938 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4939 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
4940 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4941 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
4942 	 * operation, TLB invalidation may be handled by the caller so it's possible for
4943 	 * tlb_flush_needed to be true while issue_tlbi is false.
4944 	 */
4945 	bool issue_tlbi = false;
4946 	bool tlb_flush_needed = false;
4947 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4948 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4949 		pt_entry_t tmplate = ARM_PTE_EMPTY;
4950 		bool update = false;
4951 
4952 		if (pve_p != PV_ENTRY_NULL) {
4953 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4954 			if (pte_p == PT_ENTRY_NULL) {
4955 				goto protect_skip_pve_pass1;
4956 			}
4957 		}
4958 
4959 #ifdef PVH_FLAG_IOMMU
4960 		if (pvh_ptep_is_iommu(pte_p)) {
4961 #if XNU_MONITOR
4962 			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4963 				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4964 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4965 			}
4966 #endif
4967 			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4968 				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4969 				    ppnum, ptep_get_iommu(pte_p), pve_p);
4970 			}
4971 			goto protect_skip_pve_pass1;
4972 		}
4973 #endif
4974 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4975 		const pmap_t pmap = ptdp->pmap;
4976 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4977 
4978 		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4979 #if MACH_ASSERT
4980 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4981 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4982 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4983 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4984 
4985 				pv_entry_t *check_pvep = pve_p;
4986 
4987 				do {
4988 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4989 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4990 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4991 					}
4992 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4993 
4994 				/* Restore previous PTEP value. */
4995 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4996 			}
4997 #endif
4998 			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4999 			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5000 		}
5001 
5002 #if DEVELOPMENT || DEBUG
5003 		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5004 #else
5005 		if ((prot & VM_PROT_EXECUTE))
5006 #endif
5007 		{
5008 			set_NX = false;
5009 		} else {
5010 			set_NX = true;
5011 		}
5012 
5013 #if HAS_FEAT_XS
5014 		/**
5015 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
5016 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
5017 		 */
5018 		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
5019 #endif /* HAS_FEAT_XS */
5020 
5021 		/* Remove the mapping if new protection is NONE */
5022 		if (remove) {
5023 			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
5024 				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
5025 				    __func__, pmap, ppnum);
5026 			}
5027 
5028 			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5029 			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5030 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5031 			pt_entry_t spte = *pte_p;
5032 
5033 			if (pte_is_wired(spte)) {
5034 				pte_set_wired(pmap, pte_p, 0);
5035 				spte = *pte_p;
5036 				if (pmap != kernel_pmap) {
5037 					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5038 				}
5039 			}
5040 
5041 			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
5042 			    (uint64_t)spte, pte_p, ppnum);
5043 
5044 			if (compress && is_internal && (pmap != kernel_pmap)) {
5045 				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
5046 				/* mark this PTE as having been "compressed" */
5047 				tmplate = ARM_PTE_COMPRESSED;
5048 				if (is_altacct) {
5049 					tmplate |= ARM_PTE_COMPRESSED_ALT;
5050 				}
5051 			} else {
5052 				tmplate = ARM_PTE_EMPTY;
5053 			}
5054 
5055 			assert(spte != tmplate);
5056 			write_pte_fast(pte_p, tmplate);
5057 			update = true;
5058 			++pass1_updated;
5059 
5060 			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5061 
5062 			if (pmap != kernel_pmap) {
5063 				if (ppattr_test_reusable(pai) &&
5064 				    is_internal &&
5065 				    !is_altacct) {
5066 					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5067 				} else if (!is_internal) {
5068 					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5069 				}
5070 
5071 				if (is_altacct) {
5072 					assert(is_internal);
5073 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5074 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5075 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5076 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5077 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5078 					}
5079 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5080 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5081 				} else if (ppattr_test_reusable(pai)) {
5082 					assert(is_internal);
5083 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5084 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5085 						/* was not in footprint, but is now */
5086 						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5087 					}
5088 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5089 				} else if (is_internal) {
5090 					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5091 
5092 					/*
5093 					 * Update all stats related to physical footprint, which only
5094 					 * deals with internal pages.
5095 					 */
5096 					if (options & PMAP_OPTIONS_COMPRESSOR) {
5097 						/*
5098 						 * This removal is only being done so we can send this page to
5099 						 * the compressor; therefore it mustn't affect total task footprint.
5100 						 */
5101 						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5102 					} else {
5103 						/*
5104 						 * This internal page isn't going to the compressor, so adjust stats to keep
5105 						 * phys_footprint up to date.
5106 						 */
5107 						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5108 					}
5109 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5110 				} else {
5111 					/* external page: no impact on ledgers */
5112 				}
5113 			}
5114 			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
5115 		} else {
5116 			pt_entry_t spte = *pte_p;
5117 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5118 
5119 			if (pmap == kernel_pmap) {
5120 				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5121 			} else {
5122 				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5123 			}
5124 
5125 			/*
5126 			 * While the naive implementation of this would serve to add execute
5127 			 * permission, this is not how the VM uses this interface, or how
5128 			 * x86_64 implements it.  So ignore requests to add execute permissions.
5129 			 */
5130 			if (set_NX) {
5131 				tmplate |= pt_attr_leaf_xn(pt_attr);
5132 			}
5133 
5134 
5135 			assert(spte != ARM_PTE_EMPTY);
5136 			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
5137 
5138 			if (spte != tmplate) {
5139 				/*
5140 				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
5141 				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
5142 				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
5143 				 * should always be cleared by this function.
5144 				 */
5145 				pte_set_was_writeable(tmplate, true);
5146 				write_pte_fast(pte_p, tmplate);
5147 				update = true;
5148 				++pass1_updated;
5149 			} else if (pte_was_writeable(tmplate)) {
5150 				/*
5151 				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5152 				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
5153 				 * write access to a page, this function should always at least clear that flag for
5154 				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5155 				 * these mappings go through vm_fault().  We therefore don't want those accesses to
5156 				 * be handled through arm_fast_fault().
5157 				 */
5158 				pte_set_was_writeable(tmplate, false);
5159 				write_pte_fast(pte_p, tmplate);
5160 			}
5161 		}
5162 
5163 		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5164 			tlb_flush_needed = true;
5165 			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5166 			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5167 				issue_tlbi = true;
5168 			}
5169 		}
5170 protect_skip_pve_pass1:
5171 		pte_p = PT_ENTRY_NULL;
5172 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5173 			pve_ptep_idx = 0;
5174 			pve_p = pve_next(pve_p);
5175 		}
5176 	}
5177 
5178 	if (tlb_flush_needed) {
5179 		FLUSH_PTE_STRONG();
5180 	}
5181 
5182 	if (!remove && !issue_tlbi) {
5183 		goto protect_finish;
5184 	}
5185 
5186 	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5187 	pv_entry_t **pve_pp = pv_h;
5188 	pve_p = orig_pve_p;
5189 	pte_p = orig_pte_p;
5190 	pve_ptep_idx = 0;
5191 
5192 	/*
5193 	 * We need to keep track of whether a particular PVE list contains IOMMU
5194 	 * mappings when removing entries, because we should only remove CPU
5195 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5196 	 * it around.
5197 	 */
5198 	bool iommu_mapping_in_pve = false;
5199 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5200 		if (pve_p != PV_ENTRY_NULL) {
5201 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5202 			if (pte_p == PT_ENTRY_NULL) {
5203 				goto protect_skip_pve_pass2;
5204 			}
5205 		}
5206 
5207 #ifdef PVH_FLAG_IOMMU
5208 		if (pvh_ptep_is_iommu(pte_p)) {
5209 			iommu_mapping_in_pve = true;
5210 			if (remove && (pve_p == PV_ENTRY_NULL)) {
5211 				/*
5212 				 * We've found an IOMMU entry and it's the only entry in the PV list.
5213 				 * We don't discard IOMMU entries, so simply set up the new PV list to
5214 				 * contain the single IOMMU PTE and exit the loop.
5215 				 */
5216 				new_pte_p = pte_p;
5217 				break;
5218 			}
5219 			goto protect_skip_pve_pass2;
5220 		}
5221 #endif
5222 		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5223 		const pmap_t pmap = ptdp->pmap;
5224 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5225 
5226 		if (remove) {
5227 			if (!compress && (pmap != kernel_pmap)) {
5228 				/*
5229 				 * We must wait to decrement the refcount until we're completely finished using the PTE
5230 				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
5231 				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5232 				 * under us.
5233 				 */
5234 				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5235 					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5236 				}
5237 			}
5238 			/* Remove this CPU mapping from PVE list. */
5239 			if (pve_p != PV_ENTRY_NULL) {
5240 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5241 			}
5242 		} else {
5243 			pt_entry_t spte = *pte_p;
5244 			if (pte_was_writeable(spte)) {
5245 				pte_set_was_writeable(spte, false);
5246 				write_pte_fast(pte_p, spte);
5247 			} else {
5248 				goto protect_skip_pve_pass2;
5249 			}
5250 		}
5251 		++pass2_updated;
5252 		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5253 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5254 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5255 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5256 		}
5257 
5258 protect_skip_pve_pass2:
5259 		pte_p = PT_ENTRY_NULL;
5260 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5261 			pve_ptep_idx = 0;
5262 
5263 			if (remove) {
5264 				/**
5265 				 * If there are any IOMMU mappings in the PVE list, preserve
5266 				 * those mappings in a new PVE list (new_pve_p) which will later
5267 				 * become the new PVH entry. Keep track of the CPU mappings in
5268 				 * pveh_p/pvet_p so they can be deallocated later.
5269 				 */
5270 				if (iommu_mapping_in_pve) {
5271 					iommu_mapping_in_pve = false;
5272 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5273 					pve_remove(pv_h, pve_pp, pve_p);
5274 					pveh_p = pvh_pve_list(pv_h);
5275 					pve_p->pve_next = new_pve_p;
5276 					new_pve_p = pve_p;
5277 					pve_p = temp_pve_p;
5278 					continue;
5279 				} else {
5280 					pvet_p = pve_p;
5281 					pvh_cnt++;
5282 				}
5283 			}
5284 
5285 			pve_pp = pve_next_ptr(pve_p);
5286 			pve_p = pve_next(pve_p);
5287 			iommu_mapping_in_pve = false;
5288 		}
5289 	}
5290 
5291 protect_finish:
5292 
5293 #ifdef PVH_FLAG_EXEC
5294 	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5295 		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5296 	}
5297 #endif
5298 	if (__improbable(pass1_updated != pass2_updated)) {
5299 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5300 		    __func__, pass1_updated, pass2_updated);
5301 	}
5302 	/* if we removed a bunch of entries, take care of them now */
5303 	if (remove) {
5304 		if (new_pve_p != PV_ENTRY_NULL) {
5305 			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5306 			pvh_set_flags(pv_h, pvh_flags);
5307 		} else if (new_pte_p != PT_ENTRY_NULL) {
5308 			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5309 			pvh_set_flags(pv_h, pvh_flags);
5310 		} else {
5311 			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5312 				pmap_flush_noncoherent_page(phys);
5313 			}
5314 			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5315 		}
5316 	}
5317 
5318 	if (flush_range && tlb_flush_needed) {
5319 		if (!remove) {
5320 			flush_range->ptfr_flush_needed = true;
5321 			tlb_flush_needed = false;
5322 		}
5323 	}
5324 
5325 	/*
5326 	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5327 	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
5328 	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5329 	 * a page to be repurposed while it is still live in the TLBs.
5330 	 */
5331 	if (remove && tlb_flush_needed) {
5332 		sync_tlb_flush();
5333 	}
5334 
5335 
5336 	pvh_unlock(pai);
5337 
5338 	if (remove) {
5339 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5340 #if !XNU_MONITOR
5341 		mp_enable_preemption();
5342 #endif
5343 	}
5344 
5345 	if (!remove && tlb_flush_needed) {
5346 		sync_tlb_flush();
5347 	}
5348 
5349 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5350 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5351 	}
5352 }
5353 
5354 MARK_AS_PMAP_TEXT void
5355 pmap_page_protect_options_internal(
5356 	ppnum_t ppnum,
5357 	vm_prot_t prot,
5358 	unsigned int options,
5359 	void *arg)
5360 {
5361 	if (arg != NULL) {
5362 		/*
5363 		 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5364 		 * ultimately be flushed.  The nature of ARM TLB maintenance is such that we can flush the
5365 		 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5366 		 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5367 		 * In that case, force the flush to take place.
5368 		 */
5369 		options &= ~PMAP_OPTIONS_NOFLUSH;
5370 	}
5371 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5372 }
5373 
5374 void
5375 pmap_page_protect_options(
5376 	ppnum_t ppnum,
5377 	vm_prot_t prot,
5378 	unsigned int options,
5379 	void *arg)
5380 {
5381 	pmap_paddr_t    phys = ptoa(ppnum);
5382 
5383 	assert(ppnum != vm_page_fictitious_addr);
5384 
5385 	/* Only work with managed pages. */
5386 	if (!pa_valid(phys)) {
5387 		return;
5388 	}
5389 
5390 	/*
5391 	 * Determine the new protection.
5392 	 */
5393 	if (prot == VM_PROT_ALL) {
5394 		return;         /* nothing to do */
5395 	}
5396 
5397 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5398 
5399 #if XNU_MONITOR
5400 	pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5401 #else
5402 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5403 #endif
5404 
5405 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5406 }
5407 
5408 
5409 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5410 MARK_AS_PMAP_TEXT void
5411 pmap_disable_user_jop_internal(pmap_t pmap)
5412 {
5413 	if (pmap == kernel_pmap) {
5414 		panic("%s: called with kernel_pmap", __func__);
5415 	}
5416 	validate_pmap_mutable(pmap);
5417 	pmap->disable_jop = true;
5418 }
5419 
5420 void
5421 pmap_disable_user_jop(pmap_t pmap)
5422 {
5423 #if XNU_MONITOR
5424 	pmap_disable_user_jop_ppl(pmap);
5425 #else
5426 	pmap_disable_user_jop_internal(pmap);
5427 #endif
5428 }
5429 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5430 
5431 /*
5432  * Indicates if the pmap layer enforces some additional restrictions on the
5433  * given set of protections.
5434  */
5435 bool
5436 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5437 {
5438 	return false;
5439 }
5440 
5441 /*
5442  *	Set the physical protection on the
5443  *	specified range of this map as requested.
5444  *	VERY IMPORTANT: Will not increase permissions.
5445  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5446  */
5447 void
5448 pmap_protect(
5449 	pmap_t pmap,
5450 	vm_map_address_t b,
5451 	vm_map_address_t e,
5452 	vm_prot_t prot)
5453 {
5454 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5455 }
5456 
/*
 * Apply reduced protections to the VA range [start, end) of the given pmap.
 * The range must lie within a single twig-level (leaf table) region; callers
 * split larger ranges (see pmap_protect_options()).
 *
 * Returns the VA just past the last entry processed.  This may be less than
 * 'end' if the loop broke out early due to pending preemption; the caller is
 * expected to resume from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject inverted ranges and ranges that cross a twig boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Requests that strip all access must go through pmap_remove() instead. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && tte_is_valid_table(*tte_p)) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			/*
			 * Periodically check for pending preemption; if found, break
			 * out and return the partial progress (va) to the caller.
			 */
			++npages;
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * Lock-ladder: take the PVH lock for the page the PTE points at,
			 * then re-read the PTE to make sure it still points at the same
			 * page.  If it changed underneath us, drop the lock and retry.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: the PTE may have been emptied/compressed while we raced above. */
			if ((spte == ARM_PTE_EMPTY) || ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Publish all PTE updates, then invalidate TLBs for the processed range. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No valid leaf table for this range: nothing mapped, nothing to do. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5702 
/*
 * Reduce protections on the page-aligned VA range [b, e) of the given pmap.
 * The range is processed in twig-sized (single leaf table) chunks by
 * pmap_protect_options_internal() (or its PPL equivalent), resuming from the
 * address each chunk call returns.  Requests that would remove all access are
 * redirected to pmap_remove_options().
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be page-aligned for this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* End of the twig region containing 'beg', clamped to 'e'. */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

		/* The callee returns the VA it stopped at; resume from there. */
#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5777 
5778 /**
5779  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5780  *
5781  * @param pmap pmap to insert the pages into.
5782  * @param va virtual address to map the pages into.
5783  * @param pa page number of the first physical page to map.
5784  * @param size block size, in number of pages.
5785  * @param prot mapping protection attributes.
5786  * @param attr flags to pass to pmap_enter().
5787  *
5788  * @return KERN_SUCCESS.
5789  */
5790 kern_return_t
5791 pmap_map_block(
5792 	pmap_t pmap,
5793 	addr64_t va,
5794 	ppnum_t pa,
5795 	uint32_t size,
5796 	vm_prot_t prot,
5797 	int attr,
5798 	unsigned int flags)
5799 {
5800 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5801 }
5802 
5803 /**
5804  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5805  * As opposed to pmap_map_block(), this function takes
5806  * a physical address as an input and operates using the
5807  * page size associated with the input pmap.
5808  *
5809  * @param pmap pmap to insert the pages into.
5810  * @param va virtual address to map the pages into.
5811  * @param pa physical address of the first physical page to map.
5812  * @param size block size, in number of pages.
5813  * @param prot mapping protection attributes.
5814  * @param attr flags to pass to pmap_enter().
5815  *
5816  * @return KERN_SUCCESS.
5817  */
5818 kern_return_t
5819 pmap_map_block_addr(
5820 	pmap_t pmap,
5821 	addr64_t va,
5822 	pmap_paddr_t pa,
5823 	uint32_t size,
5824 	vm_prot_t prot,
5825 	int attr,
5826 	unsigned int flags)
5827 {
5828 #if __ARM_MIXED_PAGE_SIZE__
5829 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5830 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5831 #else
5832 	const uint64_t pmap_page_size = PAGE_SIZE;
5833 #endif
5834 
5835 	for (ppnum_t page = 0; page < size; page++) {
5836 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5837 			panic("%s: failed pmap_enter_addr, "
5838 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5839 			    __FUNCTION__,
5840 			    pmap, va, (uint64_t)pa, size, prot, flags);
5841 		}
5842 
5843 		va += pmap_page_size;
5844 		pa += pmap_page_size;
5845 	}
5846 
5847 	return KERN_SUCCESS;
5848 }
5849 
/*
 * Enter a mapping for physical address pa at virtual address v in pmap.
 * Thin wrapper: forwards to pmap_enter_options_addr() with no options,
 * no argument, and PMAP_MAPPING_TYPE_INFER.
 */
kern_return_t
pmap_enter_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired)
{
	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
}
5862 
5863 /*
5864  *	Insert the given physical page (p) at
5865  *	the specified virtual address (v) in the
5866  *	target physical map with the protection requested.
5867  *
5868  *	If specified, the page will be wired down, meaning
5869  *	that the related pte can not be reclaimed.
5870  *
5871  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5872  *	or lose information.  That is, this routine must actually
5873  *	insert this page into the given map eventually (must make
5874  *	forward progress eventually.
5875  */
5876 kern_return_t
5877 pmap_enter(
5878 	pmap_t pmap,
5879 	vm_map_address_t v,
5880 	ppnum_t pn,
5881 	vm_prot_t prot,
5882 	vm_prot_t fault_type,
5883 	unsigned int flags,
5884 	boolean_t wired,
5885 	__unused pmap_mapping_type_t mapping_type)
5886 {
5887 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5888 }
5889 
5890 /*
5891  * Attempt to commit the pte.
5892  * Succeeds iff able to change *pte_p from old_pte to new_pte.
5893  * Performs no page table or accounting writes on failures.
5894  */
5895 static inline bool
5896 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5897 {
5898 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5899 	bool success = false, changed_wiring = false;
5900 
5901 	__unreachable_ok_push
5902 	if (TEST_PAGE_RATIO_4) {
5903 		/*
5904 		 * 16K virtual pages w/ 4K hw pages.
5905 		 * We actually need to update 4 ptes here which can't easily be done atomically.
5906 		 * As a result we require the exclusive pmap lock.
5907 		 */
5908 		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5909 		*old_pte = *pte_p;
5910 		if (*old_pte == new_pte) {
5911 			/* Another thread completed this operation. Nothing to do here. */
5912 			success = true;
5913 		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5914 		    pte_is_valid(*old_pte)) {
5915 			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5916 			success = false;
5917 		} else {
5918 			write_pte_fast(pte_p, new_pte);
5919 			success = true;
5920 		}
5921 	} else {
5922 		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5923 	}
5924 	__unreachable_ok_pop
5925 
5926 	if (success && *old_pte != new_pte) {
5927 		if (pte_is_valid(*old_pte)) {
5928 			bool need_strong_sync = false;
5929 			FLUSH_PTE_STRONG();
5930 #if HAS_FEAT_XS
5931 			if (pte_is_xs(pt_attr, *old_pte)) {
5932 				need_strong_sync = true;
5933 			}
5934 #endif /* HAS_FEAT_XS */
5935 			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5936 		} else {
5937 			FLUSH_PTE();
5938 			__builtin_arm_isb(ISB_SY);
5939 		}
5940 		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5941 		    (new_pte & ARM_PTE_WIRED) != 0 :
5942 		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5943 
5944 		if (pmap != kernel_pmap && changed_wiring) {
5945 			SInt16  *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5946 			if (new_pte & ARM_PTE_WIRED) {
5947 				OSAddAtomic16(1, ptd_wiredcnt_ptr);
5948 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5949 			} else {
5950 				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5951 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5952 			}
5953 		}
5954 
5955 		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5956 		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5957 	}
5958 	return success;
5959 }
5960 
/*
 * Translate VM_WIMG_* cacheability/ordering attributes into the corresponding
 * PTE bits (attribute index, shareability, and XN/PNX) for a mapping of
 * physical address pa.  Several device-type attributes are relaxed to
 * reordered/combined variants when the target address is DRAM.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
#if HAS_FEAT_XS
			// I/O ranges flagged for strong synchronization get the XS variant.
			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
			}
#endif /* HAS_FEAT_XS */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		// Real-time attribute only applies to DRAM; fall back to a posted
		// variant for non-DRAM targets.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		} else {
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		// Write-combining only applies to DRAM; non-DRAM falls back to a
		// posted variant.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		} else {
#if HAS_FEAT_XS
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
#else /* HAS_FEAT_XS */
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#endif /* HAS_FEAT_XS */
#if DEBUG || DEVELOPMENT
			pmap_wcrt_on_non_dram_count_increment_atomic();
#endif /* DEBUG || DEVELOPMENT */
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		// Normal cacheable memory: default attribute index, outer-shareable.
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
6060 
6061 
6062 /*
6063  * Construct a PTE (and the physical page attributes) for the given virtual to
6064  * physical mapping.
6065  *
6066  * This function has no side effects and is safe to call so that it is safe to
6067  * call while attempting a pmap_enter transaction.
6068  */
6069 MARK_AS_PMAP_TEXT static pt_entry_t
6070 pmap_construct_pte(
6071 	const pmap_t pmap,
6072 	vm_map_address_t va,
6073 	pmap_paddr_t pa,
6074 	vm_prot_t prot,
6075 	vm_prot_t fault_type,
6076 	boolean_t wired,
6077 	const pt_attr_t* const pt_attr,
6078 	uint16_t *pp_attr_bits /* OUTPUT */
6079 	)
6080 {
6081 	bool set_NX = false, set_XO = false;
6082 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
6083 	assert(pp_attr_bits != NULL);
6084 	*pp_attr_bits = 0;
6085 
6086 	if (wired) {
6087 		pte |= ARM_PTE_WIRED;
6088 	}
6089 
6090 #if DEVELOPMENT || DEBUG
6091 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
6092 #else
6093 	if ((prot & VM_PROT_EXECUTE))
6094 #endif
6095 	{
6096 		set_NX = false;
6097 	} else {
6098 		set_NX = true;
6099 	}
6100 
6101 	if (prot == VM_PROT_EXECUTE) {
6102 		set_XO = true;
6103 	}
6104 
6105 	if (set_NX) {
6106 		pte |= pt_attr_leaf_xn(pt_attr);
6107 	} else {
6108 		if (pmap == kernel_pmap) {
6109 			pte |= ARM_PTE_NX;
6110 		} else {
6111 			pte |= pt_attr_leaf_x(pt_attr);
6112 		}
6113 	}
6114 
6115 	if (pmap == kernel_pmap) {
6116 #if __ARM_KERNEL_PROTECT__
6117 		pte |= ARM_PTE_NG;
6118 #endif /* __ARM_KERNEL_PROTECT__ */
6119 		if (prot & VM_PROT_WRITE) {
6120 			pte |= ARM_PTE_AP(AP_RWNA);
6121 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
6122 		} else {
6123 			pte |= ARM_PTE_AP(AP_RONA);
6124 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6125 		}
6126 	} else {
6127 		if (pmap->type != PMAP_TYPE_NESTED) {
6128 			pte |= ARM_PTE_NG;
6129 		} else if ((pmap->nested_region_unnested_table_bitmap)
6130 		    && (va >= pmap->nested_region_addr)
6131 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
6132 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
6133 
6134 			if ((pmap->nested_region_unnested_table_bitmap)
6135 			    && testbit(UNNEST_BIT(index), (int *)pmap->nested_region_unnested_table_bitmap)) {
6136 				pte |= ARM_PTE_NG;
6137 			}
6138 		}
6139 		if (prot & VM_PROT_WRITE) {
6140 			assert(pmap->type != PMAP_TYPE_NESTED);
6141 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6142 				if (fault_type & VM_PROT_WRITE) {
6143 					pte |= pt_attr_leaf_rw(pt_attr);
6144 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6145 				} else {
6146 					pte |= pt_attr_leaf_ro(pt_attr);
6147 					/*
6148 					 * Mark the page as MODFAULT so that a subsequent write
6149 					 * may be handled through arm_fast_fault().
6150 					 */
6151 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6152 					pte_set_was_writeable(pte, true);
6153 				}
6154 			} else {
6155 				pte |= pt_attr_leaf_rw(pt_attr);
6156 				*pp_attr_bits |= PP_ATTR_REFERENCED;
6157 			}
6158 		} else {
6159 			if (set_XO) {
6160 				pte |= pt_attr_leaf_rona(pt_attr);
6161 			} else {
6162 				pte |= pt_attr_leaf_ro(pt_attr);
6163 			}
6164 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6165 		}
6166 	}
6167 
6168 	pte |= ARM_PTE_AF;
6169 	return pte;
6170 }
6171 
/*
 * Internal implementation of pmap_enter_options(): establish a mapping from
 * virtual address 'v' to physical address 'pa' in 'pmap' with the requested
 * protection/fault-type/wiring, updating PV lists, page attribute bits, and
 * ledger accounting for kernel-managed pages.
 *
 * Because the pmap lock may be held only shared, the PTE store is performed
 * as a cmpxchg-style transaction (pmap_enter_pte()); the surrounding
 * while(!committed) loop re-reads the PTE and retries whenever a lock had to
 * be dropped or the PTE changed underneath us.
 *
 * Returns:
 *   KERN_SUCCESS            - mapping established (or PMAP_OPTIONS_NOENTER).
 *   KERN_ABORTED            - a preemptible lock acquisition was aborted.
 *   KERN_RESOURCE_SHORTAGE  - PV-entry/page-table allocation failed under
 *                             PMAP_OPTIONS_NOWAIT.
 *   KERN_FAILURE            - executable mapping requested for a
 *                             non-kernel-managed (!pa_valid) physical page.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t         pn = (ppnum_t)atop(pa);
	pt_entry_t      pte;
	pt_entry_t      spte;
	pt_entry_t      *pte_p;
	bool            refcnt_updated;
	bool            wiredcnt_updated;
	bool            ro_va = false;
	unsigned int    wimg_bits;
	bool            committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t   kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
		panic("%s: pmap %p v 0x%llx not page-aligned",
		    __func__, pmap, (unsigned long long)v);
	}

	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		/* Remember to re-apply write protection once the mapping is in place. */
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	if (options & PMAP_OPTIONS_NOENTER) {
		/* Caller only wanted the page tables expanded; no PTE to write. */
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before commiting
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = pte_is_valid(spte);

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_EMPTY;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_EMPTY);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			if (__improbable(pmap == kernel_pmap)) {
				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
				    __func__);
			}
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* Kernel-managed page: PV list and accounting updates are required. */
			unsigned int pai;
			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			/* No PV node is needed if the page is unmapped or this PTE is already its sole mapping. */
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Return the pre-allocated nodes to the free list; pmap_enter_pv() will pull from it. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				/* PTE changed underneath us; retry with the freshly observed value in spte. */
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = pte_is_valid(spte);
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			/* Ledger updates are deferred until after the PVH lock is dropped. */
			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-kernel-managed (e.g. I/O) page: no PV tracking, no executable mappings. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed.  We must therefore prevent an attacker from downgrading a
			 * a writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    pte_is_valid(spte) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = pte_is_valid(spte);
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	/* Drop the extra ptd refcount taken at the top if the transaction didn't consume it. */
	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	/* Drop the temporary wire taken to keep the PTE page from being reclaimed. */
	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	/* For RO-zone VAs, re-enable hardware write protection now that the mapping exists. */
	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		pmap_phys_write_disable(v);
	}

	return kr;
}
6669 
/*
 * Public physical-address-based entry point for establishing a mapping.
 *
 * Wraps pmap_enter_options_internal() (or its PPL dispatch on XNU_MONITOR
 * configurations) in a retry loop:
 *  - KERN_RESOURCE_SHORTAGE: on PPL builds, donate a page to the PPL and
 *    retry; honored PMAP_OPTIONS_NOWAIT breaks out instead of retrying.
 *  - KERN_ABORTED: the internal call bailed out of a preemptible lock
 *    acquisition; simply retry.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void   *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/* The PPL requires PMAP_OPTIONS_NOWAIT; shortage is handled by the loop below. */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Feed the PPL a fresh page so the retry can allocate what it needs. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6716 
6717 kern_return_t
6718 pmap_enter_options(
6719 	pmap_t pmap,
6720 	vm_map_address_t v,
6721 	ppnum_t pn,
6722 	vm_prot_t prot,
6723 	vm_prot_t fault_type,
6724 	unsigned int flags,
6725 	boolean_t wired,
6726 	unsigned int options,
6727 	__unused void   *arg,
6728 	pmap_mapping_type_t mapping_type)
6729 {
6730 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6731 }
6732 
6733 /*
6734  *	Routine:	pmap_change_wiring
6735  *	Function:	Change the wiring attribute for a map/virtual-address
6736  *			pair.
6737  *	In/out conditions:
6738  *			The mapping must already exist in the pmap.
6739  */
6740 MARK_AS_PMAP_TEXT kern_return_t
6741 pmap_change_wiring_internal(
6742 	pmap_t pmap,
6743 	vm_map_address_t v,
6744 	boolean_t wired)
6745 {
6746 	pt_entry_t     *pte_p;
6747 	pmap_paddr_t    pa;
6748 
6749 	validate_pmap_mutable(pmap);
6750 
6751 	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6752 		return KERN_ABORTED;
6753 	}
6754 
6755 	const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6756 
6757 	pte_p = pmap_pte(pmap, v);
6758 	if (pte_p == PT_ENTRY_NULL) {
6759 		if (!wired) {
6760 			/*
6761 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6762 			 * may have been freed by a remove operation.
6763 			 */
6764 			goto pmap_change_wiring_return;
6765 		} else {
6766 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6767 		}
6768 	}
6769 	/*
6770 	 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6771 	 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6772 	 */
6773 	pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6774 
6775 	while (pa_valid(pa)) {
6776 		pmap_paddr_t new_pa;
6777 
6778 		pvh_lock(pa_index(pa));
6779 		new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6780 
6781 		if (pa == new_pa) {
6782 			break;
6783 		}
6784 
6785 		pvh_unlock(pa_index(pa));
6786 		pa = new_pa;
6787 	}
6788 
6789 	/* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6790 	if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6791 		if (!wired) {
6792 			/* PTE cleared by prior remove/disconnect operation */
6793 			goto pmap_change_wiring_cleanup;
6794 		} else {
6795 			panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6796 			    __func__, pte_p, (uint64_t)*pte_p, pmap);
6797 		}
6798 	}
6799 
6800 	assertf(pte_is_valid(*pte_p), "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6801 	if (wired != pte_is_wired(*pte_p)) {
6802 		pte_set_wired(pmap, pte_p, wired);
6803 		if (pmap != kernel_pmap) {
6804 			if (wired) {
6805 				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6806 			} else if (!wired) {
6807 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6808 			}
6809 		}
6810 	}
6811 
6812 pmap_change_wiring_cleanup:
6813 	if (pa_valid(pa)) {
6814 		pvh_unlock(pa_index(pa));
6815 	}
6816 
6817 pmap_change_wiring_return:
6818 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6819 
6820 	return KERN_SUCCESS;
6821 }
6822 
/*
 * Public wrapper for pmap_change_wiring_internal(): changes the wired
 * attribute of the mapping for 'v' in 'pmap'.  On PPL builds the call is
 * dispatched into the PPL and retried on KERN_ABORTED; any other failure
 * from the helper is fatal.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6850 
6851 MARK_AS_PMAP_TEXT pmap_paddr_t
6852 pmap_find_pa_internal(
6853 	pmap_t pmap,
6854 	addr64_t va)
6855 {
6856 	pmap_paddr_t    pa = 0;
6857 
6858 	validate_pmap(pmap);
6859 
6860 	if (pmap != kernel_pmap) {
6861 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6862 	}
6863 
6864 	pa = pmap_vtophys(pmap, va);
6865 
6866 	if (pmap != kernel_pmap) {
6867 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6868 	}
6869 
6870 	return pa;
6871 }
6872 
6873 pmap_paddr_t
6874 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6875 {
6876 	pmap_paddr_t pa = 0;
6877 
6878 	if (pmap == kernel_pmap) {
6879 		pa = mmu_kvtop(va);
6880 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6881 		/*
6882 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6883 		 * translation even if PAN would prevent kernel access through the translation.
6884 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6885 		 */
6886 		pa = mmu_uvtop(va);
6887 	}
6888 	return pa;
6889 }
6890 
/*
 * Translate 'va' to a physical address in 'pmap', trying the fault-free
 * hardware path first and falling back to a locked software table walk.
 * When called from the kernel debugger (not_in_kdp false), locks cannot be
 * taken, so the raw walk is used directly.  Returns 0 on failure.
 */
pmap_paddr_t
pmap_find_pa(
	pmap_t pmap,
	addr64_t va)
{
	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);

	if (pa != 0) {
		return pa;
	}

	if (not_in_kdp) {
#if XNU_MONITOR
		return pmap_find_pa_ppl(pmap, va);
#else
		return pmap_find_pa_internal(pmap, va);
#endif
	} else {
		/* Debugger context: walk without locks. */
		return pmap_vtophys(pmap, va);
	}
}
6912 
6913 ppnum_t
6914 pmap_find_phys_nofault(
6915 	pmap_t pmap,
6916 	addr64_t va)
6917 {
6918 	ppnum_t ppn;
6919 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6920 	return ppn;
6921 }
6922 
6923 ppnum_t
6924 pmap_find_phys(
6925 	pmap_t pmap,
6926 	addr64_t va)
6927 {
6928 	ppnum_t ppn;
6929 	ppn = atop(pmap_find_pa(pmap, va));
6930 	return ppn;
6931 }
6932 
6933 /**
6934  * Translate a kernel virtual address into a physical address.
6935  *
6936  * @param va The kernel virtual address to translate. Does not work on user
6937  *           virtual addresses.
6938  *
6939  * @return The physical address if the translation was successful, or zero if
6940  *         no valid mappings were found for the given virtual address.
6941  */
6942 pmap_paddr_t
6943 kvtophys(vm_offset_t va)
6944 {
6945 	/**
6946 	 * Attempt to do the translation first in hardware using the AT (address
6947 	 * translation) instruction. This will attempt to use the MMU to do the
6948 	 * translation for us.
6949 	 */
6950 	pmap_paddr_t pa = mmu_kvtop(va);
6951 
6952 	if (pa) {
6953 		return pa;
6954 	}
6955 
6956 	/* If the MMU can't find the mapping, then manually walk the page tables. */
6957 	return pmap_vtophys(kernel_pmap, va);
6958 }
6959 
6960 /**
6961  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6962  * points to a non-kernel-managed physical page, then this call will panic().
6963  *
6964  * @note The output of this function is guaranteed to be a kernel-managed
6965  *       physical page, which means it's safe to pass the output directly to
6966  *       pa_index() to create a physical address index for various pmap data
6967  *       structures.
6968  *
6969  * @param va The kernel virtual address to translate. Does not work on user
6970  *           virtual addresses.
6971  *
6972  * @return The translated physical address for the given virtual address.
6973  */
6974 pmap_paddr_t
6975 kvtophys_nofail(vm_offset_t va)
6976 {
6977 	pmap_paddr_t pa = kvtophys(va);
6978 
6979 	if (!pa_valid(pa)) {
6980 		panic("%s: Invalid or non-kernel-managed physical page returned, "
6981 		    "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6982 	}
6983 
6984 	return pa;
6985 }
6986 
6987 pmap_paddr_t
6988 pmap_vtophys(
6989 	pmap_t pmap,
6990 	addr64_t va)
6991 {
6992 	if ((va < pmap->min) || (va >= pmap->max)) {
6993 		return 0;
6994 	}
6995 
6996 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6997 
6998 	tt_entry_t * ttp = NULL;
6999 	tt_entry_t * ttep = NULL;
7000 	tt_entry_t   tte = ARM_TTE_EMPTY;
7001 	pmap_paddr_t pa = 0;
7002 	unsigned int cur_level;
7003 
7004 	ttp = pmap->tte;
7005 
7006 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
7007 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
7008 
7009 		tte = *ttep;
7010 
7011 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
7012 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
7013 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
7014 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
7015 
7016 		if ((tte & valid_mask) != valid_mask) {
7017 			return (pmap_paddr_t) 0;
7018 		}
7019 
7020 		/* This detects both leaf entries and intermediate block mappings. */
7021 		if ((tte & type_mask) == type_block) {
7022 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
7023 			break;
7024 		}
7025 
7026 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
7027 	}
7028 
7029 	return pa;
7030 }
7031 
7032 /*
7033  *	pmap_init_pte_page - Initialize a page table page.
7034  */
/*
 *	pmap_init_pte_page - Initialize a page table page.
 *
 *	Associates the physical page backing 'pte_p' with a page table
 *	descriptor (allocating one if 'alloc_ptd' and none exists yet) and
 *	records the page's position ('va', 'ttlevel') in that descriptor.
 *	Panics if the page's PVH state is inconsistent with use as a table.
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t   *ptdp = NULL;
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* Page already serves as a table page; reuse its existing descriptor. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
7074 
7075 /*
7076  *	Routine:	pmap_expand
7077  *
7078  *	Expands a pmap to be able to map the specified virtual address.
7079  *
7080  *	Allocates new memory for the default (COARSE) translation table
7081  *	entry, initializes all the pte entries to ARM_PTE_EMPTY and
7082  *	also allocates space for the corresponding pv entries.
7083  *
7084  *	Nothing should be locked.
7085  */
/*
 *	Routine:	pmap_expand
 *
 *	Expands a pmap to be able to map the specified virtual address.
 *
 *	Allocates new memory for the default (COARSE) translation table
 *	entry, initializes all the pte entries to ARM_PTE_EMPTY and
 *	also allocates space for the corresponding pv entries.
 *
 *	Nothing should be locked.
 *
 *	Returns KERN_SUCCESS, KERN_INVALID_ADDRESS for a VA outside the
 *	pmap's range, KERN_ABORTED if a preemptible lock acquisition was
 *	aborted, or an allocation failure code under PMAP_OPTIONS_NOWAIT.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t    pa;
	unsigned int    ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t              *tte_p;
	tt_entry_t              *tt_p;

	pa = 0x0ULL;
	tt_p =  (tt_entry_t *)NULL;

	/* Walk down one level per iteration until a table exists at 'level' for 'v'. */
	for (; ttlevel < level; ttlevel++) {
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Must allocate unlocked; another thread may install the table meanwhile. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check under the exclusive lock: a racing thread may have expanded already. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* Lost the race: free the table we allocated but did not install. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
7157 
7158 /*
7159  *	Routine:	pmap_gc
7160  *	Function:
7161  *              Pmap garbage collection
7162  *		Called by the pageout daemon when pages are scarce.
7163  *
7164  */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings.  However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
7176 
7177 /*
7178  *      By default, don't attempt pmap GC more frequently
7179  *      than once / 1 minutes.
7180  */
7181 
/* Intentionally a no-op on this platform (see pmap_gc() above). */
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7187 
7188 /*
7189  * pmap_attribute_cache_sync(vm_offset_t pa)
7190  *
7191  * Invalidates all of the instruction cache on a physical page and
7192  * pushes any dirty data from the data cache for the same physical page
7193  */
7194 
7195 kern_return_t
7196 pmap_attribute_cache_sync(
7197 	ppnum_t pp,
7198 	vm_size_t size,
7199 	__unused vm_machine_attribute_t attribute,
7200 	__unused vm_machine_attribute_val_t * value)
7201 {
7202 	if (size > PAGE_SIZE) {
7203 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7204 	} else {
7205 		cache_sync_page(pp);
7206 	}
7207 
7208 	return KERN_SUCCESS;
7209 }
7210 
7211 /*
7212  * pmap_sync_page_data_phys(ppnum_t pp)
7213  *
7214  * Invalidates all of the instruction cache on a physical page and
7215  * pushes any dirty data from the data cache for the same physical page
7216  */
/* Thin wrapper: delegate the per-page cache synchronization to cache_sync_page(). */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	cache_sync_page(pp);
}
7223 
7224 /*
7225  * pmap_sync_page_attributes_phys(ppnum_t pp)
7226  *
7227  * Write back and invalidate all cachelines on a physical page.
7228  */
7229 void
7230 pmap_sync_page_attributes_phys(
7231 	ppnum_t pp)
7232 {
7233 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7234 }
7235 
7236 #if CONFIG_COREDUMP
7237 /* temporary workaround */
7238 boolean_t
7239 coredumpok(
7240 	vm_map_t map,
7241 	mach_vm_offset_t va)
7242 {
7243 	pt_entry_t     *pte_p;
7244 	pt_entry_t      spte;
7245 
7246 	pte_p = pmap_pte(map->pmap, va);
7247 	if (0 == pte_p) {
7248 		return FALSE;
7249 	}
7250 	if (vm_map_entry_has_device_pager(map, va)) {
7251 		return FALSE;
7252 	}
7253 	spte = *pte_p;
7254 	return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
7255 }
7256 #endif
7257 
7258 void
7259 fillPage(
7260 	ppnum_t pn,
7261 	unsigned int fill)
7262 {
7263 	unsigned int   *addr;
7264 	int             count;
7265 
7266 	addr = (unsigned int *) phystokv(ptoa(pn));
7267 	count = PAGE_SIZE / sizeof(unsigned int);
7268 	while (count--) {
7269 		*addr++ = fill;
7270 	}
7271 }
7272 
7273 extern void     mapping_set_mod(ppnum_t pn);
7274 
/* VM-facing alias: mark the physical page 'pn' as modified. */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7281 
7282 extern void     mapping_set_ref(ppnum_t pn);
7283 
/* VM-facing alias: mark the physical page 'pn' as referenced. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7290 
7291 /*
7292  * Clear specified attribute bits.
7293  *
7294  * Try to force an arm_fast_fault() for all mappings of
7295  * the page - to force attributes to be set again at fault time.
7296  * If the forcing succeeds, clear the cached bits at the head.
7297  * Otherwise, something must have been wired, so leave the cached
7298  * attributes alone.
7299  */
/*
 * Clear the pp_attr bits in 'bits' for physical page 'pn', forcing fast
 * faults on existing mappings where needed so the attributes can be
 * re-derived at fault time.  'flush_range' (may be NULL) lets the caller
 * batch TLB invalidation across a VA range.
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    pa = ptoa(pn);
	vm_prot_t       allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* If the caller handles flushing (arg or flush_range), NOFLUSH doesn't apply. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Map the attribute bits being cleared to the access modes to revoke. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return.  On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if all mappings were successfully downgraded. */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7373 
/* Single-page entry point: clear attributes with no batched flush range. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7383 
7384 #if __ARM_RANGE_TLBI__
/*
 * Clear physical-page attributes for every managed page mapped in
 * [start, end), which must lie within a single twig (one leaf table).
 * Returns the VA up to which processing completed; a return value less
 * than 'end' indicates the walk stopped early for pending preemption.
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t     *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig table at all: nothing mapped here, report the range as done. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if (tte_is_valid_table(*tte_p)) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Check for preemption after the first page; return progress so far. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			/* Only managed (pa_valid) pages carry pp_attr state. */
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7428 
/*
 * Clear physical-page attributes across the VA range [start, end) of 'pmap',
 * one twig at a time, coalescing TLB invalidation into a single ranged flush
 * at the end.  Returns the VA up to which the operation completed; callers
 * re-invoke with the returned VA to resume after preemption or lock abort.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	/* Preemptible lock acquire failed: report no progress (va == start). */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Advance one twig (leaf-table) boundary at a time, clamped to 'end'. */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early on partial twig progress or pending preemption. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Perform the deferred, coalesced TLB invalidation for the whole range. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7481 
/*
 * Driver for ranged attribute clearing: repeatedly invokes the internal
 * (or PPL) worker until the entire [start, end) range has been processed,
 * since the worker may return early on preemption.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7512 #endif /* __ARM_RANGE_TLBI__ */
7513 
/* Dispatch a single-page attribute clear to the PPL or the in-kernel worker. */
static void
phys_attribute_clear(
	ppnum_t         pn,
	unsigned int    bits,
	int             options,
	void            *arg)
{
	/*
	 * Do we really want this tracepoint?  It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7535 
7536 /*
7537  *	Set specified attribute bits.
7538  *
7539  *	Set cached value in the pv head because we have
7540  *	no per-mapping hardware support for referenced and
7541  *	modify bits.
7542  */
/*
 * Set the given pp_attr bits on physical page 'pn'.  Setting bits needs no
 * TLB or PTE work, so this is a direct update of the cached attribute word.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_set_internal(
	ppnum_t pn,
	unsigned int bits)
{
	pmap_paddr_t    pa = ptoa(pn);
	assert(pn != vm_page_fictitious_addr);

#if XNU_MONITOR
	/* PPL-owned attribute bits may only be manipulated by the PPL itself. */
	if (bits & PP_ATTR_PPL_OWNED_BITS) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x",
		    __FUNCTION__,
		    pn, bits);
	}
#endif

	ppattr_pa_set_bits(pa, (uint16_t)bits);

	return;
}
7564 
/* Dispatch an attribute set to the PPL or the in-kernel worker. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7576 
7577 
7578 /*
7579  *	Check specified attribute bits.
7580  *
7581  *	use the software cached bits (since no hw support).
7582  */
7583 static boolean_t
7584 phys_attribute_test(
7585 	ppnum_t pn,
7586 	unsigned int bits)
7587 {
7588 	pmap_paddr_t    pa = ptoa(pn);
7589 	assert(pn != vm_page_fictitious_addr);
7590 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7591 }
7592 
7593 
7594 /*
7595  *	Set the modify/reference bits on the specified physical page.
7596  */
void
pmap_set_modify(ppnum_t pn)
{
	/* Software-managed "modified" bit; no hardware dirty tracking on this path. */
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7602 
7603 
7604 /*
7605  *	Clear the modify bits on the specified physical page.
7606  */
void
pmap_clear_modify(
	ppnum_t pn)
{
	/* Clears the software-cached modified bit and re-arms write faulting. */
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7613 
7614 
7615 /*
7616  *	pmap_is_modified:
7617  *
7618  *	Return whether or not the specified physical page is modified
7619  *	by any physical maps.
7620  */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	/* Reads the software-cached attribute; no PTE scan required. */
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7627 
7628 
7629 /*
7630  *	Set the reference bit on the specified physical page.
7631  */
static void
pmap_set_reference(
	ppnum_t pn)
{
	/* Software-managed "referenced" bit; set directly in the cached attributes. */
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7638 
7639 /*
7640  *	Clear the reference bits on the specified physical page.
7641  */
void
pmap_clear_reference(
	ppnum_t pn)
{
	/* Clears the software-cached referenced bit and re-arms access faulting. */
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7648 
7649 
7650 /*
7651  *	pmap_is_referenced:
7652  *
7653  *	Return whether or not the specified physical page is referenced
7654  *	by any physical maps.
7655  */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	/* Reads the software-cached attribute; no PTE scan required. */
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7662 
7663 /*
7664  * pmap_get_refmod(phys)
7665  *  returns the referenced and modified bits of the specified
7666  *  physical page.
7667  */
7668 unsigned int
7669 pmap_get_refmod(
7670 	ppnum_t pn)
7671 {
7672 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7673 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7674 }
7675 
7676 static inline unsigned int
7677 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7678 {
7679 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7680 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7681 }
7682 
7683 /*
7684  * pmap_clear_refmod(phys, mask)
7685  *  clears the referenced and modified bits as specified by the mask
7686  *  of the specified physical page.
7687  */
7688 void
7689 pmap_clear_refmod_options(
7690 	ppnum_t         pn,
7691 	unsigned int    mask,
7692 	unsigned int    options,
7693 	void            *arg)
7694 {
7695 	unsigned int    bits;
7696 
7697 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7698 	phys_attribute_clear(pn, bits, options, arg);
7699 }
7700 
7701 /*
7702  * Perform pmap_clear_refmod_options on a virtual address range.
7703  * The operation will be performed in bulk & tlb flushes will be coalesced
7704  * if possible.
7705  *
7706  * Returns true if the operation is supported on this platform.
7707  * If this function returns false, the operation is not supported and
7708  * nothing has been modified in the pmap.
7709  */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	/* Ranged TLBI available: perform the bulk clear with coalesced flushes. */
	unsigned int    bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7737 
/* Convenience wrapper: clear refmod state with no options or callback arg. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7745 
7746 unsigned int
7747 pmap_disconnect_options(
7748 	ppnum_t pn,
7749 	unsigned int options,
7750 	void *arg)
7751 {
7752 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7753 		/*
7754 		 * On ARM, the "modified" bit is managed by software, so
7755 		 * we know up-front if the physical page is "modified",
7756 		 * without having to scan all the PTEs pointing to it.
7757 		 * The caller should have made the VM page "busy" so noone
7758 		 * should be able to establish any new mapping and "modify"
7759 		 * the page behind us.
7760 		 */
7761 		if (pmap_is_modified(pn)) {
7762 			/*
7763 			 * The page has been modified and will be sent to
7764 			 * the VM compressor.
7765 			 */
7766 			options |= PMAP_OPTIONS_COMPRESSOR;
7767 		} else {
7768 			/*
7769 			 * The page hasn't been modified and will be freed
7770 			 * instead of compressed.
7771 			 */
7772 		}
7773 	}
7774 
7775 	/* disconnect the page */
7776 	pmap_page_protect_options(pn, 0, options, arg);
7777 
7778 	/* return ref/chg status */
7779 	return pmap_get_refmod(pn);
7780 }
7781 
7782 /*
7783  *	Routine:
7784  *		pmap_disconnect
7785  *
7786  *	Function:
7787  *		Disconnect all mappings for this page and return reference and change status
7788  *		in generic format.
7789  *
7790  */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);   /* return ref/chg status */
}
7798 
7799 boolean_t
7800 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7801 {
7802 	if (ptoa(first) >= vm_last_phys) {
7803 		return FALSE;
7804 	}
7805 	if (ptoa(last) < vm_first_phys) {
7806 		return FALSE;
7807 	}
7808 
7809 	return TRUE;
7810 }
7811 
7812 /*
7813  * The state maintained by the noencrypt functions is used as a
7814  * debugging aid on ARM.  This incurs some overhead on the part
7815  * of the caller.  A special case check in phys_attribute_clear
7816  * (the most expensive path) currently minimizes this overhead,
7817  * but stubbing these functions out on RELEASE kernels yields
7818  * further wins.
7819  */
7820 boolean_t
7821 pmap_is_noencrypt(
7822 	ppnum_t pn)
7823 {
7824 #if DEVELOPMENT || DEBUG
7825 	boolean_t result = FALSE;
7826 
7827 	if (!pa_valid(ptoa(pn))) {
7828 		return FALSE;
7829 	}
7830 
7831 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7832 
7833 	return result;
7834 #else
7835 #pragma unused(pn)
7836 	return FALSE;
7837 #endif
7838 }
7839 
/* Set the PP_ATTR_NOENCRYPT debugging attribute (no-op on RELEASE kernels). */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7854 
/* Clear the PP_ATTR_NOENCRYPT debugging attribute (no-op on RELEASE kernels). */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7869 
7870 #if XNU_MONITOR
/* Return TRUE if managed page 'pn' is owned by the PPL (monitor). */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7877 #endif
7878 
/*
 * Lock the per-page PV-head lock for managed page 'pn'.  Unmanaged pages
 * (and all pages when XNU_MONITOR is configured) fall back to the single
 * global phys_backup_lock.  Note the `else` body deliberately spans the
 * preprocessor branches.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7895 
7896 
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV-head
 * lock for managed pages, or the global phys_backup_lock otherwise.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int    pai;
	pmap_paddr_t    phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7913 
/*
 * Install 'pmap' as the current user translation-table base on this CPU:
 * publish its nested-pmap state into the per-CPU data, reprogram TCR if
 * the page size differs (mixed-page-size configs), and load TTBR with the
 * pmap's table root and ASID.  For the kernel pmap, the user TTB is
 * cleared instead.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		pmap_t nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap = nested_pmap;
		if (nested_pmap != NULL) {
			cpu_data_ptr->cpu_nested_pmap_attr = pmap_get_pt_attr(nested_pmap);
			/**
			 * Obtain the full shared region bounds from the nested pmap.  If the top-level pmap
			 * hasn't been fully nested yet, its bounds may not yet be configured, or may be in the
			 * process of being configured on another core.
			 */
			cpu_data_ptr->cpu_nested_region_addr = nested_pmap->nested_region_addr;
			cpu_data_ptr->cpu_nested_region_size = nested_pmap->nested_region_size;
		}
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Only write TCR when the target pmap's configuration actually differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Load TTBR with the pmap's root table address tagged with its ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7949 
/* Point the user TTBR at the invalid translation table, disabling user VA translation. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7955 
/* Traced entry point: clear the user TTB via the PPL or the in-kernel worker. */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7967 
7968 
7969 #if defined(__arm64__)
7970 /*
7971  * Marker for use in multi-pass fast-fault PV list processing.
7972  * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
7973  * these functions, as compressed PTEs should never be present in PV lists.
7974  * Note that this only holds true for arm64; for arm32 we don't have enough
7975  * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
7976  * and WRITEABLE marker depending on whether the PTE is valid.
7977  */
7978 #define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
7979 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
7980 _Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
7981 #endif
7982 
7983 
7984 MARK_AS_PMAP_TEXT static boolean_t
7985 arm_force_fast_fault_with_flush_range(
7986 	ppnum_t         ppnum,
7987 	vm_prot_t       allow_mode,
7988 	int             options,
7989 	pmap_tlb_flush_range_t *flush_range)
7990 {
7991 	pmap_paddr_t     phys = ptoa(ppnum);
7992 	pv_entry_t      *pve_p;
7993 	pt_entry_t      *pte_p;
7994 	unsigned int     pai;
7995 	unsigned int     pass1_updated = 0;
7996 	unsigned int     pass2_updated = 0;
7997 	boolean_t        result;
7998 	pv_entry_t     **pv_h;
7999 	bool             is_reusable;
8000 	bool             ref_fault;
8001 	bool             mod_fault;
8002 	bool             clear_write_fault = false;
8003 	bool             ref_aliases_mod = false;
8004 	bool             mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);
8005 
8006 	assert(ppnum != vm_page_fictitious_addr);
8007 
8008 	if (!pa_valid(phys)) {
8009 		return FALSE;   /* Not a managed page. */
8010 	}
8011 
8012 	result = TRUE;
8013 	ref_fault = false;
8014 	mod_fault = false;
8015 	pai = pa_index(phys);
8016 	if (__probable(mustsynch)) {
8017 		pvh_lock(pai);
8018 	}
8019 	pv_h = pai_to_pvh(pai);
8020 
8021 #if XNU_MONITOR
8022 	if (__improbable(ppattr_pa_test_monitor(phys))) {
8023 		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
8024 	}
8025 #endif
8026 	pte_p = PT_ENTRY_NULL;
8027 	pve_p = PV_ENTRY_NULL;
8028 	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
8029 		pte_p = pvh_ptep(pv_h);
8030 	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
8031 		pve_p = pvh_pve_list(pv_h);
8032 	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
8033 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
8034 	}
8035 
8036 	is_reusable = ppattr_test_reusable(pai);
8037 
8038 	/*
8039 	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
8040 	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
8041 	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
8042 	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
8043 	 * operation, TLB invalidation may be handled by the caller so it's possible for
8044 	 * tlb_flush_needed to be true while issue_tlbi is false.
8045 	 */
8046 	bool issue_tlbi = false;
8047 	bool tlb_flush_needed = false;
8048 
8049 	pv_entry_t *orig_pve_p = pve_p;
8050 	pt_entry_t *orig_pte_p = pte_p;
8051 	int pve_ptep_idx = 0;
8052 
8053 	/*
8054 	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
8055 	 * TLB invalidation in pass 2.
8056 	 */
8057 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8058 		pt_entry_t       spte;
8059 		pt_entry_t       tmplate;
8060 
8061 		if (pve_p != PV_ENTRY_NULL) {
8062 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8063 			if (pte_p == PT_ENTRY_NULL) {
8064 				goto fff_skip_pve_pass1;
8065 			}
8066 		}
8067 
8068 #ifdef PVH_FLAG_IOMMU
8069 		if (pvh_ptep_is_iommu(pte_p)) {
8070 			goto fff_skip_pve_pass1;
8071 		}
8072 #endif
8073 		if (*pte_p == ARM_PTE_EMPTY) {
8074 			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8075 		}
8076 		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
8077 			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8078 		}
8079 
8080 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8081 		const pmap_t pmap = ptdp->pmap;
8082 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8083 		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8084 
8085 		assert(va >= pmap->min && va < pmap->max);
8086 
8087 		/* update pmap stats and ledgers */
8088 		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8089 		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8090 		if (is_altacct) {
8091 			/*
8092 			 * We do not track "reusable" status for
8093 			 * "alternate accounting" mappings.
8094 			 */
8095 		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8096 		    is_reusable &&
8097 		    is_internal &&
8098 		    pmap != kernel_pmap) {
8099 			/* one less "reusable" */
8100 			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8101 			/* one more "internal" */
8102 			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8103 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8104 
8105 			/*
8106 			 * Since the page is being marked non-reusable, we assume that it will be
8107 			 * modified soon.  Avoid the cost of another trap to handle the fast
8108 			 * fault when we next write to this page.
8109 			 */
8110 			clear_write_fault = true;
8111 		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8112 		    !is_reusable &&
8113 		    is_internal &&
8114 		    pmap != kernel_pmap) {
8115 			/* one more "reusable" */
8116 			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8117 			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8118 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8119 		}
8120 
8121 		bool wiredskip = pte_is_wired(*pte_p) &&
8122 		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8123 
8124 		if (wiredskip) {
8125 			result = FALSE;
8126 			goto fff_skip_pve_pass1;
8127 		}
8128 
8129 		spte = *pte_p;
8130 		tmplate = spte;
8131 
8132 #if HAS_FEAT_XS
8133 		/**
8134 		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
8135 		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
8136 		 */
8137 		assert(!pte_is_xs(pt_attr, spte));
8138 #endif /* HAS_FEAT_XS */
8139 		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8140 			/* read protection sets the pte to fault */
8141 			tmplate =  tmplate & ~ARM_PTE_AF;
8142 			ref_fault = true;
8143 		}
8144 		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8145 			/* take away write permission if set */
8146 			if (pmap == kernel_pmap) {
8147 				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8148 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8149 					pte_set_was_writeable(tmplate, true);
8150 					mod_fault = true;
8151 				}
8152 			} else {
8153 				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8154 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8155 					pte_set_was_writeable(tmplate, true);
8156 					mod_fault = true;
8157 				}
8158 			}
8159 		}
8160 
8161 #if MACH_ASSERT && XNU_MONITOR
8162 		if (is_pte_xprr_protected(pmap, spte)) {
8163 			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
8164 				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
8165 				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
8166 				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
8167 				    ppnum, options, allow_mode);
8168 			}
8169 		}
8170 #endif /* MACH_ASSERT && XNU_MONITOR */
8171 
8172 		if (result && (tmplate != spte)) {
8173 			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
8174 			    !(options & PMAP_OPTIONS_NOFLUSH)) {
8175 				tlb_flush_needed = true;
8176 				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8177 				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
8178 #ifdef ARM_PTE_FF_MARKER
8179 					assert(!(spte & ARM_PTE_FF_MARKER));
8180 					tmplate |= ARM_PTE_FF_MARKER;
8181 					++pass1_updated;
8182 #endif
8183 					issue_tlbi = true;
8184 				}
8185 			}
8186 			write_pte_fast(pte_p, tmplate);
8187 		}
8188 
8189 fff_skip_pve_pass1:
8190 		pte_p = PT_ENTRY_NULL;
8191 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8192 			pve_ptep_idx = 0;
8193 			pve_p = pve_next(pve_p);
8194 		}
8195 	}
8196 
8197 	if (tlb_flush_needed) {
8198 		FLUSH_PTE_STRONG();
8199 	}
8200 
8201 	if (!issue_tlbi) {
8202 		goto fff_finish;
8203 	}
8204 
8205 	/* Pass 2: Issue any required TLB invalidations */
8206 	pve_p = orig_pve_p;
8207 	pte_p = orig_pte_p;
8208 	pve_ptep_idx = 0;
8209 
8210 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8211 		if (pve_p != PV_ENTRY_NULL) {
8212 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8213 			if (pte_p == PT_ENTRY_NULL) {
8214 				goto fff_skip_pve_pass2;
8215 			}
8216 		}
8217 
8218 #ifdef PVH_FLAG_IOMMU
8219 		if (pvh_ptep_is_iommu(pte_p)) {
8220 			goto fff_skip_pve_pass2;
8221 		}
8222 #endif
8223 
8224 #ifdef ARM_PTE_FF_MARKER
8225 		pt_entry_t spte = *pte_p;
8226 
8227 		if (!(spte & ARM_PTE_FF_MARKER)) {
8228 			goto fff_skip_pve_pass2;
8229 		} else {
8230 			spte &= (~ARM_PTE_FF_MARKER);
8231 			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
8232 			write_pte_fast(pte_p, spte);
8233 			++pass2_updated;
8234 		}
8235 #endif
8236 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8237 		const pmap_t pmap = ptdp->pmap;
8238 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8239 
8240 		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
8241 		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8242 			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
8243 			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
8244 		}
8245 
8246 fff_skip_pve_pass2:
8247 		pte_p = PT_ENTRY_NULL;
8248 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8249 			pve_ptep_idx = 0;
8250 			pve_p = pve_next(pve_p);
8251 		}
8252 	}
8253 
8254 fff_finish:
8255 	if (__improbable(pass1_updated != pass2_updated)) {
8256 		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
8257 		    __func__, pass1_updated, pass2_updated);
8258 	}
8259 
8260 	/*
8261 	 * If we are using the same approach for ref and mod
8262 	 * faults on this PTE, do not clear the write fault;
8263 	 * this would cause both ref and mod to be set on the
8264 	 * page again, and prevent us from taking ANY read/write
8265 	 * fault on the mapping.
8266 	 */
8267 	if (clear_write_fault && !ref_aliases_mod) {
8268 		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
8269 	}
8270 	if (tlb_flush_needed) {
8271 		if (flush_range) {
8272 			/* Delayed flush. Signal to the caller that the flush is needed. */
8273 			flush_range->ptfr_flush_needed = true;
8274 		} else {
8275 			sync_tlb_flush();
8276 		}
8277 	}
8278 
8279 	/* update global "reusable" status for this page */
8280 	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8281 		ppattr_clear_reusable(pai);
8282 	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8283 		ppattr_set_reusable(pai);
8284 	}
8285 
8286 	if (mod_fault) {
8287 		ppattr_set_modfault(pai);
8288 	}
8289 	if (ref_fault) {
8290 		ppattr_set_reffault(pai);
8291 	}
8292 	if (__probable(mustsynch)) {
8293 		pvh_unlock(pai);
8294 	}
8295 	return result;
8296 }
8297 
8298 MARK_AS_PMAP_TEXT boolean_t
8299 arm_force_fast_fault_internal(
8300 	ppnum_t         ppnum,
8301 	vm_prot_t       allow_mode,
8302 	int             options)
8303 {
8304 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8305 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8306 	}
8307 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8308 }
8309 
8310 /*
8311  *	Routine:	arm_force_fast_fault
8312  *
8313  *	Function:
8314  *		Force all mappings for this page to fault according
8315  *		to the access modes allowed, so we can gather ref/modify
8316  *		bits again.
8317  */
8318 
8319 boolean_t
8320 arm_force_fast_fault(
8321 	ppnum_t         ppnum,
8322 	vm_prot_t       allow_mode,
8323 	int             options,
8324 	__unused void   *arg)
8325 {
8326 	pmap_paddr_t    phys = ptoa(ppnum);
8327 
8328 	assert(ppnum != vm_page_fictitious_addr);
8329 
8330 	if (!pa_valid(phys)) {
8331 		return FALSE;   /* Not a managed page. */
8332 	}
8333 
8334 #if XNU_MONITOR
8335 	return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8336 #else
8337 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8338 #endif
8339 }
8340 
8341 /*
8342  *	Routine:	arm_clear_fast_fault
8343  *
8344  *	Function:
8345  *		Clear pending force fault for all mappings for this page based on
8346  *		the observed fault type, update ref/modify bits.
8347  */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;
	/* Pass-1/pass-2 bookkeeping: both passes must touch the same set of PTEs. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	/* The caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/*
	 * If the caller supplied a specific PTE, operate only on that mapping;
	 * otherwise walk the page's entire PV list (single PTEP or PVE chain).
	 */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				/*
				 * Restore the write permission that arm_force_fast_fault()
				 * previously removed (it recorded that via "was_writeable").
				 */
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			/* A write fault implies the page is both referenced and modified. */
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Read fault: set the Access Flag and record the reference. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_EMPTY);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				/* Tag the PTE so pass 2 knows it needs a TLB invalidate. */
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the work done under the PVH lock; the caller may redrive. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make the pass-1 PTE stores visible before issuing TLB invalidates. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs tagged in pass 1 need an invalidate. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must mirror the pass-1 chunk limit so both passes cover the same PTEs. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		/* Wait for all async TLB invalidations issued above to complete. */
		sync_tlb_flush();
	}
	return result;
}
8549 
8550 /*
8551  * Determine if the fault was induced by software tracking of
8552  * modify/reference bits.  If so, re-enable the mapping (and set
8553  * the appropriate bits).
8554  *
8555  * Returns KERN_SUCCESS if the fault was induced and was
8556  * successfully handled.
8557  *
8558  * Returns KERN_FAILURE if the fault was not induced and
8559  * the function was unable to deal with it.
8560  *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8562  * disallows this type of access.
8563  *
8564  * Returns KERN_ABORTED if the pmap lock is taken and a
8565  * preemption is pending.
8566  *
8567  */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_EMPTY;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* Bail out (caller redrives on KERN_ABORTED) rather than delay preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Snapshot the PTE and take the PVH lock for its physical page,
		 * retrying if the PTE changed before the lock was acquired.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_EMPTY) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Faults on PPL-owned I/O memory are explicit protection failures. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8707 
8708 kern_return_t
8709 arm_fast_fault(
8710 	pmap_t pmap,
8711 	vm_map_address_t va,
8712 	vm_prot_t fault_type,
8713 	bool was_af_fault,
8714 	__unused bool from_user)
8715 {
8716 	kern_return_t   result = KERN_FAILURE;
8717 
8718 	if (va < pmap->min || va >= pmap->max) {
8719 		return result;
8720 	}
8721 
8722 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8723 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8724 	    from_user);
8725 
8726 	do {
8727 #if XNU_MONITOR
8728 		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8729 #else
8730 		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8731 #endif
8732 	} while (result == KERN_ABORTED);
8733 
8734 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8735 
8736 	return result;
8737 }
8738 
8739 void
8740 pmap_copy_page(
8741 	ppnum_t psrc,
8742 	ppnum_t pdst,
8743 	int options)
8744 {
8745 	bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8746 	    (addr64_t) (ptoa(pdst)),
8747 	    PAGE_SIZE,
8748 	    options);
8749 }
8750 
8751 
8752 /*
8753  *	pmap_copy_page copies the specified (machine independent) pages.
8754  */
8755 void
8756 pmap_copy_part_page(
8757 	ppnum_t psrc,
8758 	vm_offset_t src_offset,
8759 	ppnum_t pdst,
8760 	vm_offset_t dst_offset,
8761 	vm_size_t len)
8762 {
8763 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8764 	    (addr64_t) (ptoa(pdst) + dst_offset),
8765 	    len);
8766 }
8767 
8768 
8769 /*
8770  *	pmap_zero_page zeros the specified (machine independent) page.
8771  */
8772 void
8773 pmap_zero_page(
8774 	ppnum_t pn)
8775 {
8776 	assert(pn != vm_page_fictitious_addr);
8777 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8778 }
8779 
8780 void
8781 pmap_zero_page_with_options(
8782 	ppnum_t pn,
8783 	int options)
8784 {
8785 	assert(pn != vm_page_fictitious_addr);
8786 	bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8787 }
8788 
8789 /*
8790  *	pmap_zero_part_page
8791  *	zeros the specified (machine independent) part of a page.
8792  */
8793 void
8794 pmap_zero_part_page(
8795 	ppnum_t pn,
8796 	vm_offset_t offset,
8797 	vm_size_t len)
8798 {
8799 	assert(pn != vm_page_fictitious_addr);
8800 	assert(offset + len <= PAGE_SIZE);
8801 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8802 }
8803 
/*
 * Map the lowGlo structure at its fixed LOWGLOBAL_ALIAS address in the
 * kernel pmap.  The alias must currently be unmapped.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t      *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/*
	 * Never-executable mapping with the Access Flag preset.
	 * NOTE(review): AP_RONA is OR'd in raw here, while other call sites in
	 * this file wrap it as ARM_PTE_AP(AP_RONA) — confirm AP_RONA is already
	 * shifted into the AP field, otherwise this does not actually make the
	 * mapping read-only.
	 */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	/* Tell KASan about the new mapping so accesses to it aren't flagged. */
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8828 
8829 vm_offset_t
8830 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8831 {
8832 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8833 		panic("%s: invalid index %u", __func__, index);
8834 	}
8835 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8836 }
8837 
/*
 * Map physical page 'pn' into a free per-CPU copy window with the given
 * protection and WIMG attributes, returning the window index.  Panics if
 * all CPUWINDOWS_MAX windows for this CPU are in use.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t      *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int    cpu_num;
	unsigned int    i;
	vm_offset_t     cpu_copywindow_vaddr = 0;
	bool            need_strong_sync = false;

#if XNU_MONITOR
	unsigned int    cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef  __ARM_COHERENT_IO__
	/* With the PPL enabled, only unmanaged (I/O) pages may use copy windows. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (invalid-PTE) copy window for this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (!pte_is_valid(*ptep)) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, never-executable mapping for the target page. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	/* XS memory types require "strong" synchronization at unmap time. */
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8912 
/*
 * Map the given physical page into a free per-CPU copy window and return
 * the window index.  Dispatches to the PPL when XNU_MONITOR is enabled.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8925 
8926 MARK_AS_PMAP_TEXT void
8927 pmap_unmap_cpu_windows_copy_internal(
8928 	unsigned int index)
8929 {
8930 	pt_entry_t      *ptep;
8931 	unsigned int    cpu_num;
8932 	vm_offset_t     cpu_copywindow_vaddr = 0;
8933 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8934 
8935 	cpu_num = pmap_cpu_data->cpu_number;
8936 
8937 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8938 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
8939 	 * (which are likely to have been on I/O memory) are complete before
8940 	 * tearing down the mapping. */
8941 	__builtin_arm_dsb(DSB_SY);
8942 	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8943 	write_pte_strong(ptep, ARM_PTE_EMPTY);
8944 	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8945 }
8946 
/*
 * Tear down a per-CPU copy-window mapping previously established by
 * pmap_map_cpu_windows_copy().  Dispatches to the PPL when XNU_MONITOR
 * is enabled.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8957 
8958 #if XNU_MONITOR
8959 
8960 MARK_AS_PMAP_TEXT void
8961 pmap_invoke_with_page(
8962 	ppnum_t page_number,
8963 	void *ctx,
8964 	void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8965 {
8966 	#pragma unused(page_number, ctx, callback)
8967 }
8968 
8969 /*
8970  * Loop over every pmap_io_range (I/O ranges marked as owned by
8971  * the PPL in the device tree) and conditionally call callback() on each range
8972  * that needs to be included in the hibernation image.
8973  *
8974  * @param ctx      Will be passed as-is into the callback method. Use NULL if no
8975  *                 context is needed in the callback.
8976  * @param callback Callback function invoked on each range (gated by flag).
8977  */
8978 MARK_AS_PMAP_TEXT void
8979 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8980 {
8981 	extern const pmap_io_range_t* io_attr_table;
8982 	extern const unsigned int num_io_rgns;
8983 	for (unsigned int i = 0; i < num_io_rgns; ++i) {
8984 		if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8985 			callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8986 		}
8987 	}
8988 }
8989 
8990 /**
8991  * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8992  * PPL-owned page. Otherwise, do nothing.
8993  *
8994  * @param addr Physical address of the page to set the HASHED flag on.
8995  */
8996 MARK_AS_PMAP_TEXT void
8997 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8998 {
8999 	/* Ignore non-managed kernel memory. */
9000 	if (!pa_valid(addr)) {
9001 		return;
9002 	}
9003 
9004 	const unsigned int pai = pa_index(addr);
9005 	if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
9006 		pv_entry_t **pv_h = pai_to_pvh(pai);
9007 
9008 		/* Mark that the PPL-owned page has been hashed into the hibernation image. */
9009 		pvh_lock(pai);
9010 		pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
9011 		pvh_unlock(pai);
9012 	}
9013 }
9014 
9015 /**
9016  * Loop through every physical page in the system and clear out the HASHED flag
9017  * on every PPL-owned page. That flag is used to keep track of which pages have
9018  * been hashed into the hibernation image during the hibernation entry process.
9019  *
9020  * The HASHED flag needs to be cleared out between hibernation cycles because the
9021  * pv_head_table and pp_attr_table's might have been copied into the hibernation
9022  * image with the HASHED flag set on certain pages. It's important to clear the
9023  * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
9024  * into the hibernation image can't be compromised across hibernation cycles.
9025  */
9026 MARK_AS_PMAP_TEXT void
9027 pmap_clear_ppl_hashed_flag_all(void)
9028 {
9029 	const unsigned int last_index = pa_index(vm_last_phys);
9030 	pv_entry_t **pv_h = NULL;
9031 
9032 	for (int pai = 0; pai < last_index; ++pai) {
9033 		pv_h = pai_to_pvh(pai);
9034 
9035 		/* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
9036 		if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
9037 		    (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
9038 			pvh_lock(pai);
9039 			pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
9040 			pvh_unlock(pai);
9041 		}
9042 	}
9043 }
9044 
9045 /**
9046  * Enforce that all PPL-owned pages were hashed into the hibernation image. The
9047  * ppl_hib driver will call this after all wired pages have been copied into the
9048  * hibernation image.
9049  */
9050 MARK_AS_PMAP_TEXT void
9051 pmap_check_ppl_hashed_flag_all(void)
9052 {
9053 	const unsigned int last_index = pa_index(vm_last_phys);
9054 	pv_entry_t **pv_h = NULL;
9055 
9056 	for (int pai = 0; pai < last_index; ++pai) {
9057 		pv_h = pai_to_pvh(pai);
9058 
9059 		/**
9060 		 * The PMAP stacks are explicitly not saved into the image so skip checking
9061 		 * the pages that contain the PMAP stacks.
9062 		 */
9063 		const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
9064 		    (pai < pa_index(pmap_stacks_end_pa));
9065 
9066 		if (!is_pmap_stack &&
9067 		    (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
9068 		    !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
9069 			panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
9070 		}
9071 	}
9072 }
9073 
9074 #endif /* XNU_MONITOR */
9075 
9076 /*
9077  * Indicate that a pmap is intended to be used as a nested pmap
9078  * within one or more larger address spaces.  This must be set
9079  * before pmap_nest() is called with this pmap as the 'subordinate'.
9080  */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Only an ordinary user pmap may be converted into a nested pmap. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested() | pmap_switch()
	 * --------------------------------------
	 * set nested        | set active
	 * store-load barrier| store-load barrier
	 * assert !active    | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_set_shared_region() hasn't tried
	 * to give this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* free_id() releases this pmap's address-space ID — presumably a nested
	 * pmap is only ever entered through its parents. NOTE(review): confirm. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
9127 
/*
 * External entry point for pmap_set_nested_internal(); routed through the
 * PPL when XNU_MONITOR is enabled.
 */
__mockable void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
9138 
/*
 * Return true if this pmap has been marked (via pmap_set_nested()) as a
 * nested pmap for sharing within larger address spaces.
 */
bool
pmap_is_nested(
	pmap_t pmap)
{
	return pmap->type == PMAP_TYPE_NESTED;
}
9145 
9146 /*
9147  * pmap_trim_range(pmap, start, end)
9148  *
9149  * pmap  = pmap to operate on
9150  * start = start of the range
9151  * end   = end of the range
9152  *
9153  * Attempts to deallocate TTEs for the given range in the nested range.
9154  */
9155 MARK_AS_PMAP_TEXT static void
9156 pmap_trim_range(
9157 	pmap_t pmap,
9158 	addr64_t start,
9159 	addr64_t end)
9160 {
9161 	addr64_t cur;
9162 	addr64_t nested_region_start;
9163 	addr64_t nested_region_end;
9164 	addr64_t adjusted_start;
9165 	addr64_t adjusted_end;
9166 	addr64_t adjust_offmask;
9167 	tt_entry_t * tte_p;
9168 	pt_entry_t * pte_p;
9169 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9170 
9171 	if (__improbable(end < start)) {
9172 		panic("%s: invalid address range, "
9173 		    "pmap=%p, start=%p, end=%p",
9174 		    __func__,
9175 		    pmap, (void*)start, (void*)end);
9176 	}
9177 
9178 	nested_region_start = pmap->nested_region_addr;
9179 	nested_region_end = nested_region_start + pmap->nested_region_size;
9180 
9181 	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9182 		panic("%s: range outside nested region %p-%p, "
9183 		    "pmap=%p, start=%p, end=%p",
9184 		    __func__, (void *)nested_region_start, (void *)nested_region_end,
9185 		    pmap, (void*)start, (void*)end);
9186 	}
9187 
9188 	/* Contract the range to TT page boundaries. */
9189 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9190 	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9191 	adjusted_end = end & ~adjust_offmask;
9192 
9193 	/* Iterate over the range, trying to remove TTEs. */
9194 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
9195 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9196 
9197 		tte_p = pmap_tte(pmap, cur);
9198 
9199 		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
9200 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
9201 
9202 			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
9203 			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
9204 				/* Deallocate for the nested map. */
9205 				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9206 			} else if (pmap->type == PMAP_TYPE_USER) {
9207 				/**
9208 				 * Just remove for the parent map. If the leaf table pointed
9209 				 * to by the TTE being removed (owned by the nested pmap)
9210 				 * has any mappings, then this call will panic. This
9211 				 * enforces the policy that tables being trimmed must be
9212 				 * empty to prevent possible use-after-free attacks.
9213 				 */
9214 				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
9215 			} else {
9216 				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9217 			}
9218 		} else {
9219 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9220 		}
9221 	}
9222 
9223 	/* Remove empty L2 TTs. */
9224 	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9225 	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
9226 
9227 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
9228 		/* For each L1 entry in our range... */
9229 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
9230 
9231 		bool remove_tt1e = true;
9232 		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9233 		tt_entry_t * tt2e_start;
9234 		tt_entry_t * tt2e_end;
9235 		tt_entry_t * tt2e_p;
9236 		tt_entry_t tt1e;
9237 
9238 		if (tt1e_p == NULL) {
9239 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9240 			continue;
9241 		}
9242 
9243 		tt1e = *tt1e_p;
9244 
9245 		if (tt1e == ARM_TTE_TYPE_FAULT) {
9246 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9247 			continue;
9248 		}
9249 
9250 		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9251 		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9252 
9253 		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9254 			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9255 				/*
9256 				 * If any TTEs are populated, don't remove the
9257 				 * L1 TT.
9258 				 */
9259 				remove_tt1e = false;
9260 			}
9261 		}
9262 
9263 		if (remove_tt1e) {
9264 			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9265 		} else {
9266 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9267 		}
9268 	}
9269 }
9270 
9271 /**
9272  * State machine for multi-step pmap trimming. Trimming is the action of
9273  * deallocating the TTEs of the shared region of pmaps down to a given range.
9274  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9275  * disabling preemption for too long. These steps include computing the bounds
9276  * of the shared region, trimming the head of the "grand", trimming the tail of
9277  * the "grand", and trimming the "subord". Some of the steps can be skipped under
9278  * different conditions.
9279  *
9280  * @param grand the pmap in which the pages are nested
9281  * @param subord the pmap from which the pages are shared, or nested
9282  * @param vstart start of the used range in "grand"
9283  * @param size size of the used range
9284  * @param state the current state of the state machine
9285  *
9286  * @return the next state of the state machine, to be used in the next call
9287  *         into this function.
9288  */
9289 MARK_AS_PMAP_TEXT pmap_trim_state_t
9290 pmap_trim_internal(
9291 	pmap_t grand,
9292 	pmap_t subord,
9293 	addr64_t vstart,
9294 	uint64_t size,
9295 	pmap_trim_state_t state)
9296 {
9297 	/* Validation needs to be done regardless of state. */
9298 	addr64_t vend;
9299 
9300 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9301 		panic("%s: grand addr wraps around, "
9302 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9303 		    __func__, grand, subord, (void*)vstart, size, state);
9304 	}
9305 
9306 	validate_pmap_mutable(grand);
9307 	validate_pmap(subord);
9308 
9309 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9310 		panic("%s: subord is of non-nestable type 0x%hhx, "
9311 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9312 		    __func__, subord->type, grand, subord, (void*)vstart, size, state);
9313 	}
9314 
9315 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9316 		panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9317 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9318 		    __func__, grand->type, grand, subord, (void*)vstart, size, state);
9319 	}
9320 
9321 	if (__improbable(grand->nested_pmap != subord)) {
9322 		panic("%s: grand->nested != subord, "
9323 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9324 		    __func__, grand, subord, (void*)vstart, size, state);
9325 	}
9326 
9327 	if (__improbable((size != 0) &&
9328 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9329 		panic("%s: grand range not in nested region, "
9330 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9331 		    __func__, grand, subord, (void*)vstart, size, state);
9332 	}
9333 
9334 
9335 	/* Trimming starts with figuring out the bounds for the grand. */
9336 	if (state == PMAP_TRIM_STATE_START) {
9337 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9338 
9339 		/**
9340 		 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9341 		 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9342 		 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9343 		 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9344 		 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9345 		 * PMAP_TRIM_STATE_DONE.
9346 		 */
9347 		if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9348 			assert(subord->nested_bounds_set);
9349 
9350 			/* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9351 			if (!grand->nested_bounds_set) {
9352 				/* Inherit the bounds from subord. */
9353 				grand->nested_region_true_start = subord->nested_region_true_start;
9354 				grand->nested_region_true_end = subord->nested_region_true_end;
9355 				grand->nested_bounds_set = true;
9356 			}
9357 
9358 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9359 
9360 			/* Now that the grand has bounds, we are done. */
9361 			return PMAP_TRIM_STATE_DONE;
9362 		}
9363 
9364 		/* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9365 		if ((!subord->nested_bounds_set) && size) {
9366 			const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9367 			const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9368 
9369 			subord->nested_region_true_start = vstart;
9370 			subord->nested_region_true_end = vend;
9371 			subord->nested_region_true_start &= ~adjust_offmask;
9372 
9373 			if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9374 				panic("%s: padded true end wraps around, "
9375 				    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9376 				    __func__, grand, subord, (void*)vstart, size, state);
9377 			}
9378 
9379 			subord->nested_region_true_end &= ~adjust_offmask;
9380 			subord->nested_bounds_set = true;
9381 		}
9382 
9383 		/* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9384 		if (subord->nested_bounds_set) {
9385 			/* Inherit the bounds from subord. */
9386 			grand->nested_region_true_start = subord->nested_region_true_start;
9387 			grand->nested_region_true_end = subord->nested_region_true_end;
9388 			grand->nested_bounds_set = true;
9389 
9390 			/* If we know the bounds, we can trim the pmap. */
9391 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9392 
9393 			state = PMAP_TRIM_STATE_GRAND_BEFORE;
9394 		} else {
9395 			/* Don't trim if we don't know the bounds. */
9396 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9397 
9398 			return PMAP_TRIM_STATE_DONE;
9399 		}
9400 	}
9401 
9402 	/* Sanity check here: we are ready to trim, do we know the bounds yet? */
9403 	if (!grand->nested_bounds_set) {
9404 		panic("%s: !grand->nested_bounds_set, "
9405 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9406 		    __func__, grand, subord, (void*)vstart, size, state);
9407 	}
9408 
9409 	if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9410 		pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9411 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9412 		    NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9413 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9414 			    (unsigned int)grand->nested_no_bounds_ref_state);
9415 		}
9416 
9417 #if XNU_MONITOR
9418 		if (pmap_pending_preemption()) {
9419 			return PMAP_TRIM_STATE_GRAND_AFTER;
9420 		}
9421 #endif
9422 
9423 		state = PMAP_TRIM_STATE_GRAND_AFTER;
9424 	}
9425 
9426 	if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9427 		pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9428 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9429 		    NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9430 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9431 			    (unsigned int)grand->nested_no_bounds_ref_state);
9432 		}
9433 
9434 #if XNU_MONITOR
9435 		if (pmap_pending_preemption()) {
9436 			return PMAP_TRIM_STATE_SUBORD;
9437 		}
9438 #endif
9439 
9440 		state = PMAP_TRIM_STATE_SUBORD;
9441 	}
9442 
9443 	/* START state is guaranteed to compute the bounds for the subord. */
9444 	if (!subord->nested_bounds_set) {
9445 		panic("%s: !subord->nested_bounds_set, "
9446 		    "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9447 		    __func__, grand, subord, (void*)vstart, size, state);
9448 	}
9449 
9450 	if (state == PMAP_TRIM_STATE_SUBORD) {
9451 		/**
9452 		 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9453 		 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9454 		 * called once grand's nested tables have been fully trimmed, and can only be called once
9455 		 * for each 'grand' pmap.  We use release ordering for the atomics above to ensure that
9456 		 * the state update is visible only once the preceding trim operation is complete.  An
9457 		 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9458 		 * but locking within pmap_trim_range() should make that harmless (and all but one will
9459 		 * ultimately panic due to a failed atomic state CAS).  We use acquire ordering here to
9460 		 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9461 		 * of the state CAS.
9462 		 */
9463 		if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9464 		    NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9465 			panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9466 			    (unsigned int)grand->nested_no_bounds_ref_state);
9467 		}
9468 		pmap_trim_subord(subord);
9469 	}
9470 
9471 	return PMAP_TRIM_STATE_DONE;
9472 }
9473 
/**
 * Drop this pmap's no-bounds reference on its nested pmap (if any) and trim
 * this pmap's page tables down to the nested pmap's true bounds.  Called
 * with 'pmap' being the top-level ("grand") pmap.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's bounds under its lock, then drop the lock before trimming. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the regions before the true start and after the true end. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9497 
9498 /*
9499  * pmap_trim_subord(grand, subord)
9500  *
9501  * grand  = pmap that we have nested subord in
9502  * subord = nested pmap we are attempting to trim
9503  *
9504  * Trims subord if possible
9505  */
9506 MARK_AS_PMAP_TEXT static void
9507 pmap_trim_subord(pmap_t subord)
9508 {
9509 	bool contract_subord = false;
9510 
9511 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9512 
9513 	subord->nested_no_bounds_refcnt--;
9514 
9515 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9516 		/* If this was the last no bounds reference, trim subord. */
9517 		contract_subord = true;
9518 	}
9519 
9520 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9521 
9522 	if (contract_subord) {
9523 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9524 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9525 	}
9526 }
9527 
9528 /**
9529  * Deallocates the TTEs of the shared region of pmaps down to a given range.
9530  * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9531  * disabling preemption for too long.
9532  *
9533  * @note When we load the shared region we always create pages tables for the
9534  *       entire region. In practice, the shared cache may use just a portion
9535  *       of that. Before we know the bounds of the shared region, it can
9536  *       already be mapped into processes. Therefore, once the bounds are
9537  *       known, "trimming" comes in handy to remove the unnecessary page
9538  *       tables in the processes the shared region is mapped in, and eventually
9539  *       those in the shared region itself. Note that the shared region must
9540  *       be trimmed after the user processes because it has the L3 entries
9541  *       everyone else is pointing to.
9542  *
9543  * @param grand the pmap in which the pages are nested
9544  * @param subord the pmap from which the pages are shared, or nested
9545  * @param vstart start of the used range in "grand"
9546  * @param size size of the used range
9547  */
void
pmap_trim(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	pmap_trim_state_t state = PMAP_TRIM_STATE_START;

#if XNU_MONITOR
	/* On PPL systems, drives the state machine until it's done. */
	while (state != PMAP_TRIM_STATE_DONE) {
		__assert_only pmap_trim_state_t old_state = state;
		state = pmap_trim_ppl(grand, subord, vstart, size, state);

		/* Are we making progress? */
		assert(old_state != state);
	}

	/* Sanity-check ledgers now that trimming is complete. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	state = pmap_trim_internal(grand, subord, vstart, size, state);

	/* On non-PPL systems, we expect the implementation to finish in one call. */
	assert(state == PMAP_TRIM_STATE_DONE);
#endif
}
9576 
9577 #if HAS_APPLE_PAC
/**
 * Sign a user-space pointer with a process-independent PAC key while the
 * given user JOP key is temporarily installed.
 *
 * @param value the pointer value to sign
 * @param key the PAC key to sign with; only ptrauth_key_asia and
 *            ptrauth_key_asda are permitted (panics otherwise)
 * @param discriminator the PAC discriminator to use
 * @param jop_key the user JOP key to install for the duration of the signing
 *
 * @return the signed pointer
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	/* Disable interrupts so nothing can run while the user JOP key is swapped in. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Barriers keep the sign instruction inside the enable/disable JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9609 
/* Thin wrapper around pmap_sign_user_ptr_internal(); see that function for details. */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9615 
/**
 * Authenticate a user-space pointer signed with a process-independent PAC
 * key, with the given user JOP key temporarily installed.
 *
 * @param value the signed pointer value to authenticate
 * @param key the PAC key to authenticate with; only ptrauth_key_asia and
 *            ptrauth_key_asda are permitted (panics otherwise)
 * @param discriminator the PAC discriminator to use
 * @param jop_key the user JOP key to install for the duration of the auth
 *
 * @return the authenticated (stripped) pointer
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/* Disable interrupts so nothing can run while the user JOP key is swapped in. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Barriers keep the auth instruction inside the enable/disable JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9636 
/* Thin wrapper around pmap_auth_user_ptr_internal(); see that function for details. */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9642 #endif /* HAS_APPLE_PAC */
9643 
9644 /*
9645  * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9646  * the 'subordinate' pmap and has begun operating on the 'grand' pmap.  This
9647  * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9648  * return value, to indicate where a preempted [un]nest operation should resume.
9649  * When the return value contains the ending address of the nested region with
9650  * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9651  */
9652 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9653 
9654 /**
9655  * Establishes the pmap associated with a shared region as the nested pmap
9656  * for a top-level user pmap.
9657  *
9658  * @param grand The top-level user pmap
9659  * @param subord The pmap to be set as [grand]'s nested pmap
9660  * @param vstart The base VA of the region to be nested.
9661  * @param size The size (in bytes) of the region to be nested.
9662  */
MARK_AS_PMAP_TEXT kern_return_t
pmap_set_shared_region_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	addr64_t        vend;
	uint64_t        nested_region_unnested_table_bitmap_size;
	unsigned int*   nested_region_unnested_table_bitmap = NULL;
	kern_return_t   kr = KERN_SUCCESS;

	validate_pmap_mutable(grand);
	validate_pmap(subord);

#if XNU_MONITOR
	/*
	 * Ordering is important here.  validate_pmap() has already ensured subord is a
	 * PPL-controlled pmap pointer, but it could have already been destroyed or could
	 * be in the process of being destroyed.  If destruction is already committed,
	 * then the check of ref_count below will cover us.  If destruction is initiated
	 * during or after this call, then pmap_destroy() will catch the non-zero
	 * nested_count.
	 */
	os_atomic_inc(&subord->nested_count, relaxed);
	os_atomic_thread_fence(seq_cst);
#endif
	/* Take a reference on subord; it is released on failure via pmap_destroy_internal() below. */
	if (__improbable(os_atomic_inc_orig(&subord->ref_count, acquire) <= 0)) {
		panic("%s: invalid subordinate pmap %p", __func__, subord);
	}

	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The nested region must be aligned to leaf-table boundaries. */
	if (__improbable(((size | vstart) & (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("%s: pmap %p unaligned set_shared_region request 0x%llx, 0x%llx",
		    __func__, grand, vstart, size);
	}
	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
	}

	/* First nesting of subord: allocate and publish its unnested-table bitmap. */
	if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
		nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;

		/**
		 * Each even-numbered entry in the unnested table bit stores the table's unnesting state to be used
		 * by pmap_enter() in determining whether to set the NG bit, while the subsequent odd-numbered
		 * entry stores the "unnest in progress" indicator for the table, which is used by pmap_unnest()
		 * to determine if an L3 table may not have been fully marked NG due to an interrupted operation.
		 */
		nested_region_unnested_table_bitmap_size <<= 1;

		if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
			panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __func__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

#if XNU_MONITOR
		pmap_paddr_t pa = 0;

		/* The PPL allocates exactly one page for the bitmap; it must fit. */
		if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
			panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
			    __FUNCTION__, nested_region_unnested_table_bitmap_size,
			    grand, subord, vstart, size);
		}

		kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);

		if (kr != KERN_SUCCESS) {
			goto done;
		}

		assert(pa);

		nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
#else
		nested_region_unnested_table_bitmap = kalloc_data(
			nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
			Z_WAITOK | Z_ZERO);
#endif

		if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
			kr = KERN_ABORTED;
			goto done;
		}

		/* Re-check under the lock: a concurrent caller may have published a bitmap already. */
		if (subord->nested_region_unnested_table_bitmap == NULL) {
			subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
			subord->nested_region_addr = vstart;
			subord->nested_region_size = (mach_vm_offset_t) size;

			/**
			 * Use a store-release operation to ensure that the rest of the subord->nested_region_*
			 * fields are initialized and visible before setting the nested_region_unnested_table_bitmap
			 * field (which is used as the flag to say that the rest are initialized).
			 */
			os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
			nested_region_unnested_table_bitmap = NULL;
		}
		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
	}

	/* Atomically claim grand's nested_pmap slot; panics if another nesting already won. */
	if (__improbable(!os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst))) {
		panic("%s: attempt to nest pmap %p into pmap %p which already has a nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}
	/**
	 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
	 * into a nested pmap, which would then produce multiple levels of nesting.
	 */
	if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
		panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
	}

done:
	/* Free the locally allocated bitmap if we lost the publication race or bailed out. */
	if (nested_region_unnested_table_bitmap != NULL) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
#else
		kfree_data(nested_region_unnested_table_bitmap,
		    nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
		nested_region_unnested_table_bitmap = NULL;
	}

	if (kr != KERN_SUCCESS) {
		/* Undo the nested_count/ref_count taken at entry. */
#if XNU_MONITOR
		os_atomic_dec(&subord->nested_count, relaxed);
#endif
		pmap_destroy_internal(subord);
	}

	return kr;
}
9806 
/**
 * Associate a shared-region (nested) pmap with a top-level user pmap.
 * Wrapper that retries the PPL call on resource shortage or lock abort;
 * see pmap_set_shared_region_internal() for the actual work.
 *
 * @param grand The top-level user pmap
 * @param subord The pmap to be set as [grand]'s nested pmap
 * @param vstart The base VA of the region to be nested.
 * @param size The size (in bytes) of the region to be nested.
 */
__mockable void
pmap_set_shared_region(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);

	pmap_verify_preemptible();
#if XNU_MONITOR
	/* Retry loop: feed pages to the PPL on shortage; KERN_ABORTED just retries. */
	do {
		kr = pmap_set_shared_region_ppl(grand, subord, vstart, size);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		} else if ((kr != KERN_SUCCESS) && (kr != KERN_ABORTED)) {
			panic("%s: unexpected return code 0x%x from pmap_set_shared_region_ppl",
			    __func__, kr);
		}
	} while (kr != KERN_SUCCESS);

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_set_shared_region_internal()
	 * will wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	kr = pmap_set_shared_region_internal(grand, subord, vstart, size);
	assert3u(kr, ==, KERN_SUCCESS);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
}
9845 
9846 /**
9847  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9848  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9849  * This function operates in 3 main phases:
9850  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9851  * 2. Expansion of subord to ensure the required leaf-level page table pages for
9852  *    the mapping range are present in subord.
9853  * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9854  *    contains pointers to subord's leaf-level pagetable pages for the specified
9855  *    VA range.
9856  *
9857  * This function may return early due to pending AST_URGENT preemption; if so
9858  * it will indicate the need to be re-entered.
9859  *
9860  * @note This function requires that [subord] has already been associated with
9861  *       [grand] through a call to pmap_set_shared_region().
9862  *
9863  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9864  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9865  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9866  * @param size twig-aligned size of the nesting range
9867  * @param vrestart the twig-aligned starting address of the current call.  May contain
9868  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9869  * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9870  *        KERN_RESOURCE_SHORTAGE on allocation failure.
9871  *
9872  * @return the virtual address at which to restart the operation, possibly including
9873  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
9874  *         (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9875  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_nest_internal(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size,
	vm_map_offset_t vrestart,
	kern_return_t *krp)
{
	kern_return_t kr = KERN_FAILURE;
	vm_map_offset_t vaddr;
	tt_entry_t     *stte_p;
	tt_entry_t     *gtte_p;
	int             expand_options = 0;
	bool            grand_locked = false;

	/* Reject a nesting range that wraps around the address space. */
	addr64_t vend;
	if (__improbable(os_add_overflow(vstart, size, &vend))) {
		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
	}
	/* The restart cursor (with the phase bit masked off) must lie within the request. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
	}

	assert(krp != NULL);
	validate_pmap_mutable(grand);
	validate_pmap(subord);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	/* grand may only ever nest a single pmap, established before this call. */
	if (__improbable(subord != grand->nested_pmap)) {
		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
		    __func__, subord, grand, grand->nested_pmap);
	}

#if XNU_MONITOR
	/*
	 * Within the PPL, table allocations must not block; callers handle the
	 * resulting KERN_RESOURCE_SHORTAGE and retry (see pmap_nest()).
	 */
	expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
#endif

	/* Start, size, and restart cursor must all be leaf-table aligned. */
	if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
		    grand, vstart, size, (unsigned long long)vrestart);
	}

	/* Preemptible lock acquire: bail out (KERN_ABORTED) rather than spin. */
	if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
		kr = KERN_ABORTED;
		goto nest_cleanup;
	}

	if (__improbable((subord->nested_region_addr + subord->nested_region_size) < vend) ||
	    (subord->nested_region_addr > vstart)) {
		panic("%s: attempt to nest [0x%llx, 0x%llx) in pmap %p outside nested pmap %p bounds [0x%llx, 0x%llx)\n",
		    __func__, vstart, vend, grand, subord, subord->nested_region_addr, subord->nested_region_addr + subord->nested_region_size);
	}
	if (grand->nested_region_size == 0) {
		/*
		 * If this is grand's first nesting operation, keep the reference on subord.
		 * It will be released by pmap_destroy_internal() when grand is destroyed.
		 */
		if (!subord->nested_bounds_set) {
			/*
			 * We are nesting without the shared regions bounds
			 * being known.  We'll have to trim the pmap later.
			 */
			if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
			    NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
				panic("%s: grand %p already nested", __func__, grand);
			}
			subord->nested_no_bounds_refcnt++;
		}

		/**
		 * Ensure that we won't exceed the nested_region_unnested_table bitmap bounds established
		 * in pmap_set_shared_region_internal().
		 */
		if (__improbable((vstart < subord->nested_region_addr) ||
		    (vend > (subord->nested_region_addr + subord->nested_region_size)))) {
			panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
			    __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
			    (void *) (subord->nested_region_addr + subord->nested_region_size));
		}

		grand->nested_region_addr = vstart;
		grand->nested_region_size = (mach_vm_offset_t) size;
	} else {
		if (__improbable(grand->nested_region_addr > vstart)) {
			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
			/* Grow grand's recorded nested region to cover this request. */
			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
		}
	}

	/* Resume from the cursor, clamped to subord's trimmed ("true") bounds. */
	vaddr = vrestart & ~PMAP_NEST_GRAND;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}

	addr64_t true_end = vend;
	if (true_end > subord->nested_region_true_end) {
		true_end = subord->nested_region_true_end;
	}
	__unused unsigned int ttecount = 0;

	/* Phase-2 restart: subord was already expanded; jump to copying TTEs into grand. */
	if (vrestart & PMAP_NEST_GRAND) {
		goto nest_grand;
	}

	/* Phase 1: ensure subord has page tables allocated for the entire range. */
	while (vaddr < true_end) {
		stte_p = pmap_tte(subord, vaddr);
		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
			/* Drop the lock across the allocation in pmap_expand(). */
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
		}
		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr;
		++ttecount;
		/*
		 * Periodically check for pending urgent preemption and return early;
		 * the caller re-enters at vrestart.
		 */
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
			kr = KERN_SUCCESS;
			goto done;
		}
	}
	/*
	 * copy TTEs from subord pmap into grand pmap
	 */

	vaddr = (vm_map_offset_t) vstart;
	if (vaddr < subord->nested_region_true_start) {
		vaddr = subord->nested_region_true_start;
	}
	/* Phase 1 complete: tag the cursor so a re-entry skips straight to phase 2. */
	vrestart = vaddr | PMAP_NEST_GRAND;

nest_grand:
	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);

	if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
		kr = KERN_ABORTED;
		goto done;
	}
	/* Phase 2: install subord's twig-level TTEs into grand. */
	while (vaddr < true_end) {
		gtte_p = pmap_tte(grand, vaddr);
		if (gtte_p == PT_ENTRY_NULL) {
			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
			if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
				/* Don't let a lock-acquire failure mask an expansion failure. */
				if (kr == KERN_SUCCESS) {
					kr = KERN_ABORTED;
				}
			}

			if (kr != KERN_SUCCESS) {
				goto done;
			}

			gtte_p = pmap_tt2e(grand, vaddr);
		}
		/* Don't leak a page table page.  Don't violate break-before-make. */
		if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
			panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
			    __func__, gtte_p, grand);
		}
		/**
		 * It's possible that grand was trimmed by pmap_trim_internal() while the
		 * lock was dropped, in which case the previously stored "true" start/end
		 * will no longer be accurate.  In that case, we need to avoid nesting
		 * tables outside the trimmed range, as those tables may be immediately freed
		 * which would lead to a dangling page table pointer in grand.
		 * Note that pmap_trim() may concurrently update grand's bounds as we are
		 * making these checks, but in that case pmap_trim_range() has not yet
		 * been called on grand and will wait for us to drop grand's lock, so it
		 * should see any TTEs we've nested here and clear them appropriately.
		 */
		if (__probable((vaddr >= grand->nested_region_true_start) &&
		    (vaddr < grand->nested_region_true_end))) {
			stte_p = pmap_tte(subord, vaddr);
			if (__improbable(stte_p == PT_ENTRY_NULL)) {
				panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
			}
			*gtte_p = *stte_p;
		}

		vaddr += pt_attr_twig_size(pt_attr);
		vrestart = vaddr | PMAP_NEST_GRAND;
		++ttecount;
		if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Covered the whole (possibly trimmed) range: report completion as vend | PMAP_NEST_GRAND. */
	if (vaddr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	kr = KERN_SUCCESS;
done:

	/* Publish the new TTEs before any translation can consume them. */
	FLUSH_PTE();
	__builtin_arm_isb(ISB_SY);

	if (grand_locked) {
		pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
	}

nest_cleanup:
#if XNU_MONITOR
	/* krp points at kernel (non-PPL) memory; pin it while the PPL writes through it. */
	if (kr != KERN_SUCCESS) {
		pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
		*krp = kr;
		pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
	}
#else
	if (kr != KERN_SUCCESS) {
		*krp = kr;
	}
#endif
	return vrestart;
}
10103 
/**
 * Kernel-facing entry point for nesting subord into grand at
 * [vstart, vstart + size).  Repeatedly calls the (possibly PPL-resident)
 * nesting implementation, which may return early for preemption or resource
 * shortage, until the restart cursor reports completion ((vend | PMAP_NEST_GRAND)).
 *
 * @param grand pmap into which the nested mappings are installed
 * @param subord pmap whose twig TTEs are shared into grand
 * @param vstart twig-aligned base address of the range to nest
 * @param size twig-aligned size of the range
 *
 * @return KERN_SUCCESS, or the first hard failure reported by the implementation.
 */
__mockable kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* Allocation failed inside the PPL; donate a page and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the cursor or we would spin forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	/* Verify both pmaps' ledgers still balance after the PPL calls. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
10162 
10163 /*
10164  *	kern_return_t pmap_unnest(grand, vaddr)
10165  *
10166  *	grand  = the pmap that will have the virtual range unnested
10167  *	vaddr  = start of range in pmap to be unnested
10168  *	size   = size of range in pmap to be unnested
10169  *
10170  */
10171 
10172 kern_return_t
10173 pmap_unnest(
10174 	pmap_t grand,
10175 	addr64_t vaddr,
10176 	uint64_t size)
10177 {
10178 	return pmap_unnest_options(grand, vaddr, size, 0);
10179 }
10180 
10181 /**
10182  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10183  * from a top-level pmap ('grand').  The corresponding mappings in the nested
10184  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10185  * still have the region nested.  The mappings in 'grand' will be left empty
10186  * with the assumption that they will be demand-filled by subsequent access faults.
10187  *
10188  * This function operates in 2 main phases:
10189  * 1. Iteration over the nested pmap's mappings for the specified range to mark
10190  *    them non-global.
10191  * 2. Clearing of the twig-level TTEs for the address range in grand.
10192  *
10193  * This function may return early due to pending AST_URGENT preemption; if so
10194  * it will indicate the need to be re-entered.
10195  *
10196  * @param grand pmap from which to unnest mappings
10197  * @param vaddr twig-aligned virtual address for the beginning of the nested range
10198  * @param size twig-aligned size of the nested range
10199  * @param vrestart the page-aligned starting address of the current call.  May contain
10200  *        PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10201  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10202  *        grand is being torn down and step 1) above is not needed.
10203  *
10204  * @return the virtual address at which to restart the operation, possibly including
10205  *         PMAP_NEST_GRAND to indicate the phase at which to restart.  If
10206  *         (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10207  */
MARK_AS_PMAP_TEXT vm_map_offset_t
pmap_unnest_options_internal(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	vm_map_offset_t vrestart,
	unsigned int option)
{
	vm_map_offset_t start;
	vm_map_offset_t addr;
	tt_entry_t     *tte_p;
	unsigned int    current_index;
	unsigned int    start_index;
	unsigned int    max_index;
	unsigned int    entry_count = 0;

	/* Reject an unnest range that wraps around the address space. */
	addr64_t vend;
	addr64_t true_end;
	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
	}
	/* The restart cursor (with the phase bit masked off) must lie within the request. */
	if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
	    ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
		panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
		    (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
	}

	validate_pmap_mutable(grand);

	/* The whole request must fall inside grand's recorded nested region. */
	if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
		panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);

	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
		    (unsigned long long)vaddr, (unsigned long long)size);
	}

	if (__improbable(grand->nested_pmap == NULL)) {
		panic("%s: %p has no nested pmap", __func__, grand);
	}

	/* Clamp the operation to the nested pmap's trimmed ("true") end. */
	true_end = vend;
	if (true_end > grand->nested_pmap->nested_region_true_end) {
		true_end = grand->nested_pmap->nested_region_true_end;
	}

	/*
	 * Phase 1: mark the range's mappings in the nested pmap non-global.
	 * Skipped entirely for PMAP_UNNEST_CLEAN (grand is being torn down) or
	 * when re-entering directly into phase 2 (PMAP_NEST_GRAND set).
	 */
	if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
		if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
			/* Lock contention: ask the caller to re-enter at the same cursor. */
			return vrestart;
		}

		start = vrestart;
		if (start < grand->nested_pmap->nested_region_true_start) {
			start = grand->nested_pmap->nested_region_true_start;
		}
		/* Twig-table indices into the nested region's unnest bitmap. */
		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
		bool flush_tlb = false;

		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
			pt_entry_t  *bpte, *cpte;

			/* End of the twig-table region containing addr. */
			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);

			bpte = pmap_pte(grand->nested_pmap, addr);

			/*
			 * If we've re-entered this function partway through unnesting a leaf region, the
			 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
			 * the run of PTEs and the adjacent "in-progress" bit will be set.
			 */
			if (!testbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
			    testbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap)) {
				/*
				 * Mark the 'twig' region as being unnested.  Every mapping entered within
				 * the nested pmap in this region will now be marked non-global.  Do this
				 * before marking any of the PTEs within the region as non-global to avoid
				 * the possibility of pmap_enter() subsequently inserting a global mapping
				 * in the region, which could lead to a TLB conflict if a non-global entry
				 * is later inserted for the same VA in a pmap which has fully unnested this
				 * region.
				 */
				setbit(UNNEST_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				setbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
					pmap_paddr_t    pa;
					unsigned int    pai = 0;
					boolean_t               managed = FALSE;
					pt_entry_t  spte;

					if (pte_is_valid(*cpte) && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
						/*
						 * For a managed page, take the PVH lock and re-read the PTE to
						 * make sure it still maps the same physical page before updating.
						 */
						spte = *((volatile pt_entry_t*)cpte);
						while (!managed) {
							pa = pte_to_pa(spte);
							if (!pa_valid(pa)) {
								break;
							}
							pai = pa_index(pa);
							pvh_lock(pai);
							spte = *((volatile pt_entry_t*)cpte);
							pa = pte_to_pa(spte);
							if (pai == pa_index(pa)) {
								managed = TRUE;
								break; // Leave the PVH locked as we'll unlock it after we update the PTE
							}
							pvh_unlock(pai);
						}

						/* Set the non-global bit if not already present. */
						if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
							write_pte_fast(cpte, (spte | ARM_PTE_NG));
							flush_tlb = true;
						}

						if (managed) {
							pvh_assert_locked(pai);
							pvh_unlock(pai);
						}
					}

					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
					vrestart = addr;
					++entry_count;
					/* Yield mid-region on pending preemption; "in-progress" bit stays set. */
					if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
					    pmap_pending_preemption())) {
						goto unnest_subord_done;
					}
				}
				/* Finished this twig region; clear the partial-progress marker. */
				clrbit(UNNEST_IN_PROGRESS_BIT(current_index), (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
			}
			addr = vlim;
			vrestart = addr;
			++entry_count;
			if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
		}

unnest_subord_done:
		/* Only flush if at least one PTE actually lost its global bit. */
		if (flush_tlb) {
			FLUSH_PTE_STRONG();
			PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
		}

		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
		/* Phase 1 incomplete (preempted): hand the cursor back to the caller. */
		if (current_index < max_index) {
			return vrestart;
		}
	}

	/*
	 * invalidate all pdes for segment at vaddr in pmap grand
	 */
	if (vrestart & PMAP_NEST_GRAND) {
		addr = vrestart & ~PMAP_NEST_GRAND;
		/*
		 * NOTE(review): the '!= 0x0ULL' below compares the result of
		 * __improbable() rather than the masked address; the net effect is
		 * the same (any nonzero offset bits still panic), but the
		 * parenthesization looks unintended -- confirm.
		 */
		if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
			panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
		}
	} else {
		/* Phase 1 just completed: restart phase 2 from the beginning of the range. */
		addr = vaddr;
		vrestart = vaddr | PMAP_NEST_GRAND;
	}

	/**
	 * If we exit here due to a busy grand pmap lock, vrestart will be marked
	 * PMAP_NEST_GRAND so that this function jumps straightly into step two
	 * upon reentry.
	 */
	if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
		return vrestart;
	}

	if (addr < grand->nested_pmap->nested_region_true_start) {
		addr = grand->nested_pmap->nested_region_true_start;
	}

	start = addr;

	/* Phase 2: clear grand's twig TTEs; faults will re-populate on demand. */
	while (addr < true_end) {
		tte_p = pmap_tte(grand, addr);
		/*
		 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
		 * so it's possible that a region we're trying to unnest may not have been
		 * nested in the first place.
		 */
		if (tte_p != NULL) {
			*tte_p = ARM_TTE_TYPE_FAULT;
		}
		addr += pt_attr_twig_size(pt_attr);
		vrestart = addr | PMAP_NEST_GRAND;
		++entry_count;
		if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
		    pmap_pending_preemption())) {
			break;
		}
	}
	/* Covered the whole (possibly trimmed) range: report completion. */
	if (addr >= true_end) {
		vrestart = vend | PMAP_NEST_GRAND;
	}

	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(grand, start, addr, false, false);

	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);

	return vrestart;
}
10418 
/**
 * Kernel-facing entry point for unnesting: repeatedly invokes the (possibly
 * PPL-resident) pmap_unnest_options_internal() until the returned restart
 * cursor reports completion ((vaddr + size) | PMAP_NEST_GRAND).
 *
 * @param grand pmap from which to unnest mappings
 * @param vaddr twig-aligned base address of the nested range
 * @param size twig-aligned size of the nested range
 * @param option PMAP_UNNEST_* control flags, forwarded to the implementation
 *
 * @return KERN_SUCCESS (the implementation panics on invalid requests).
 */
__mockable kern_return_t
pmap_unnest_options(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size,
	unsigned int option)
{
	vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
	vm_map_offset_t vend = vaddr + size;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));

	pmap_verify_preemptible();
	while (vrestart != (vend | PMAP_NEST_GRAND)) {
#if XNU_MONITOR
		vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
#else
		vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);

	return KERN_SUCCESS;
}
10445 
/*
 * ARM performs no adjustment of unnest bounds; always report TRUE so the
 * caller still proceeds (see the inline comment).
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10454 
10455 /**
10456  * Perform any necessary pre-nesting of the parent's shared region at fork()
10457  * time.
10458  *
10459  * @note This should only be called from vm_map_fork().
10460  *
10461  * @param old_pmap The pmap of the parent task.
10462  * @param new_pmap The pmap of the child task.
10463  *
10464  * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10465  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
10466  */
10467 kern_return_t
10468 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
10469 {
10470 	if (old_pmap == NULL || new_pmap == NULL) {
10471 		return KERN_INVALID_ARGUMENT;
10472 	}
10473 	if (old_pmap->nested_pmap == NULL) {
10474 		return KERN_SUCCESS;
10475 	}
10476 	/**
10477 	 * Obtain the full shared region bounds from the nested pmap.  If old_pmap
10478 	 * hasn't been fully nested yet, its bounds may not yet be configured.
10479 	 */
10480 	pmap_set_shared_region(new_pmap,
10481 	    old_pmap->nested_pmap,
10482 	    old_pmap->nested_pmap->nested_region_addr,
10483 	    old_pmap->nested_pmap->nested_region_size);
10484 	return KERN_SUCCESS;
10485 }
10486 
10487 /*
10488  * disable no-execute capability on
10489  * the specified pmap
10490  */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* DEVELOPMENT || DEBUG only: clear the per-pmap NX enforcement flag. */
	pmap->nx_enabled = FALSE;
}
#else
/* On release kernels this is deliberately a no-op: NX cannot be disabled. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10505 
10506 /*
10507  * flush a range of hardware TLB entries.
10508  * NOTE: assumes the smallest TLB entry in use will be for
10509  * an ARM small page (4K).
10510  */
10511 
10512 #if __ARM_RANGE_TLBI__
10513 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10514 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
10515 #else
10516 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
10517 #endif // __ARM_RANGE_TLBI__
10518 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10519     "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10520     "of npages to 32 bits below may truncate.");
10521 
/*
 * Asynchronously (no trailing synchronization barrier) invalidate TLB entries
 * for [va, va + length) in the given pmap, choosing between a full/ASID flush,
 * a range-TLBI, or per-entry invalidates based on the page count.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	/* Large ranges: cheaper to drop the entire ASID (or whole TLB) at once. */
	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t       flush_all = FALSE;

		/* For ASID 0 or nested pmaps, fall back to a full TLB flush. */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	/* Mid-size ranges: a single range-TLBI covers the whole span. */
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small ranges: issue one invalidate per page, tagged with the ASID. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10574 
10575 MARK_AS_PMAP_TEXT static void
10576 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10577 {
10578 	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10579 }
10580 
/*
 * Synchronously flush kernel-pmap TLB entries for [va, va + length):
 * issue the asynchronous invalidates, then wait for them to complete.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10589 
10590 unsigned int
10591 pmap_cache_attributes(
10592 	ppnum_t pn)
10593 {
10594 	pmap_paddr_t    paddr;
10595 	unsigned int    pai;
10596 	unsigned int    result;
10597 	pp_attr_t       pp_attr_current;
10598 
10599 	paddr = ptoa(pn);
10600 
10601 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10602 
10603 	if (!pa_valid(paddr)) {
10604 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10605 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10606 	}
10607 
10608 	result = VM_WIMG_DEFAULT;
10609 
10610 	pai = pa_index(paddr);
10611 
10612 	pp_attr_current = pp_attr_table[pai];
10613 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10614 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
10615 	}
10616 	return result;
10617 }
10618 
/*
 * Performs cache maintenance required when a page's WIMG (cacheability)
 * attributes transition between certain modes.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * NOTE(review): '(wimg_bits_new != VM_WIMG_COPYBACK) ||
	 * (wimg_bits_new != VM_WIMG_INNERWBACK)' is a tautology, which reduces
	 * the VM_WIMG_WTHRU clause to 'wimg_bits_prev == VM_WIMG_WTHRU' alone;
	 * '&&' was presumably intended -- confirm before changing behavior.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Entering real-time (RT) mode: force-clean the page out of the dcache. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10635 
/**
 * Updates the cache attributes of a single (managed) compressor page and
 * performs any cache maintenance the attribute transition requires.
 *
 * @param pn page number of the page to update; must be pa_valid
 * @param prev_cacheattr the page's previous VM_WIMG_* attributes
 * @param new_cacheattr the VM_WIMG_* attributes to install
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must never have their attributes rewritten from here. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* Rewrite the page's mapping attributes while holding the PVH lock. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Perform any cache maintenance implied by the WIMG transition. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10660 
/**
 * Returns a kernel virtual address (via the physical aperture) for a
 * compressor page.  When the physical aperture is PTE-mapped
 * (__ARM_PTE_PHYSMAP__), the page's cache attributes are first switched to
 * VM_WIMG_DEFAULT; pmap_unmap_compressor_page() restores them.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10676 
/**
 * Counterpart to pmap_map_compressor_page(): if the page's recorded cache
 * attributes are not VM_WIMG_DEFAULT, reinstall them on the page's mappings
 * (the map operation had switched them to VM_WIMG_DEFAULT).  kva is unused;
 * the page is addressed by its page number.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10691 
10692 /**
10693  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10694  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10695  *
10696  * @param page_list List of pages to be updated.
10697  * @param cacheattr The new cache attribute.
10698  */
void
pmap_batch_set_cache_attributes(
	const unified_page_list_t *page_list,
	unsigned int cacheattr)
{
	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);

	if (page_list->type != UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY) {
		/**
		 * For the UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY case, we process the UPL in batch below.
		 * In an ideal world we would just use these iterator functions within
		 * pmap_batch_set_cache_attributes_internal() for all cases, but since this is the PPL
		 * that means we'll need to take special care to handle pending preemption and
		 * if necessary return the iterator position out to this function and then re-enter
		 * pmap_batch_set_cache_attributes_internal() at the same iterator position in a
		 * secure manner.  Not impossible, but also not trivial, so unless someone asks for
		 * this perf improvement on the PPL I'm going to take the lazy approach here.
		 */
		unified_page_list_iterator_t iter;

		/* Non-UPL-array lists: update pages one at a time via the iterator. */
		for (unified_page_list_iterator_init(page_list, &iter);
		    !unified_page_list_iterator_end(&iter);
		    unified_page_list_iterator_next(&iter)) {
			bool is_fictitious = false;
			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
			/* Fictitious pages have no backing frame to update. */
			if (__probable(!is_fictitious)) {
#if XNU_MONITOR
				pmap_set_cache_attributes_ppl(pn, cacheattr);
#else /* !XNU_MONITOR */
				pmap_set_cache_attributes_internal(pn, cacheattr);
#endif /* XNU_MONITOR */
			}
		}
		return;
	}

	/* Empty UPL: nothing to do. */
	if (page_list->upl.upl_size == 0) {
		return;
	}

	/* Multi-pass state machine; each call may return early and is resumed here. */
	batch_set_cache_attr_state_t states;
	states.page_index = 0;
	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
	states.tlb_flush_pass_needed = false;
	states.rt_cache_flush_pass_needed = false;

	/* Verify we are being called from a preemptible context. */
	pmap_verify_preemptible();

	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
#if XNU_MONITOR
		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#else /* !XNU_MONITOR */
		states = pmap_batch_set_cache_attributes_internal(page_list->upl.upl_info,
		    states, page_list->upl.upl_size, cacheattr);
#endif /* XNU_MONITOR */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
}
10760 
10761 /**
10762  * Flushes TLB entries associated with the page specified by paddr, but do not
10763  * issue barriers yet.
10764  *
10765  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10766  */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* Caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page and queue a TLB invalidate for its VA. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry; move on. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU TLB entries; nothing to invalidate. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Advance to the next PTE slot, moving to the next PV entry when exhausted. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10820 
10821 /**
10822  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10823  *
10824  * @param pai The Physical Address Index of the entry.
10825  * @param cacheattr The new cache attribute.
10826  */
10827 MARK_AS_PMAP_TEXT static void
10828 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10829 {
10830 	pvh_assert_locked(pai);
10831 
10832 	pp_attr_t pp_attr_current, pp_attr_template;
10833 	do {
10834 		pp_attr_current = pp_attr_table[pai];
10835 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10836 
10837 		/**
10838 		 * WIMG bits should only be updated under the PVH lock, but we should do
10839 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10840 		 */
10841 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10842 }
10843 
10844 /**
10845  * Batch updates the cache attributes of a list of pages in three passes.
10846  *
10847  * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10848  * In pass two, TLB entries are flushed for each page in the list if necessary.
10849  * In pass three, caches are cleaned for each page in the list if necessary.
10850  *
10851  * When running in PPL, this function may decide to return to the caller in response
10852  * to AST_URGENT.
10853  *
10854  * @param user_page_list List of pages to be updated.
10855  * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10856  * @param page_cnt Number of pages in total in user_page_list.
10857  * @param cacheattr The new cache attributes.
10858  *
10859  * @return The new state of the state machine.
10860  */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the packed state-machine fields into locals for this invocation. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* A zero WIMG field in pp_attr_table denotes VM_WIMG_DEFAULT. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* A transition into the RT memory type requires a cache clean (pass 3). */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* Synchronize all of the async flushes issued above with one barrier. */
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			/* Clean the page's physical-aperture alias to the point of coherency. */
			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Ensure DC by VA ops are disabled again before returning early. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the locals into the state structure returned to the caller. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
11097 
/**
 * Common helper for setting a single page's cache attributes: updates the
 * pp_attr_table WIMG bits, rewrites any existing mappings of the page, and
 * performs the required TLB/cache maintenance via pmap_sync_wimg().
 *
 * @param pn The page number of the page to update.
 * @param cacheattr The new cache attributes (VM_WIMG_*).
 * @param external Whether the request originated outside the PPL; only used
 *        for ownership validation on XNU_MONITOR builds.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t    paddr;
	unsigned int    pai;
	pp_attr_t       pp_attr_current;
	pp_attr_t       pp_attr_template;
	unsigned int    wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return;                         /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* External callers may not touch PPL pages and vice versa. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* A zero WIMG field in pp_attr_table denotes VM_WIMG_DEFAULT. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only touch the mappings (and TLB) when the effective attributes changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance required by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
11160 
11161 MARK_AS_PMAP_TEXT void
11162 pmap_set_cache_attributes_internal(
11163 	ppnum_t pn,
11164 	unsigned int cacheattr)
11165 {
11166 	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
11167 }
11168 
11169 void
11170 pmap_set_cache_attributes(
11171 	ppnum_t pn,
11172 	unsigned int cacheattr)
11173 {
11174 #if XNU_MONITOR
11175 	pmap_set_cache_attributes_ppl(pn, cacheattr);
11176 #else
11177 	pmap_set_cache_attributes_internal(pn, cacheattr);
11178 #endif
11179 }
11180 
11181 /**
11182  * Updates the page numbered ppnum to have attribute specified by attributes.
11183  * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
11184  * The necessity of the TLB flush is returned in case this function is called
11185  * in a batched manner and the TLB flush is intended to be done at a different
11186  * timing.
11187  *
11188  * @param ppnum Page Number of the page to be updated.
11189  * @param attributes The new cache attributes.
11190  * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11191  *        immediately.
11192  *
11193  * @return Returns true if a TLB flush is needed for this update regardless of
11194  *         whether a flush has occurred already.
11195  */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t      *pve_p;
	pt_entry_t      *pte_p;
	pv_entry_t      **pv_h;
	pt_entry_t      tmplate;
	unsigned int    pai;
	boolean_t       tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally reject device-type memory attributes on managed DRAM pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO:                        // nGnRnE
		case VM_WIMG_POSTED:                    // nGnRE
		/* supported on DRAM, but slow, so we disallow */

		case VM_WIMG_POSTED_REORDERED:          // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First update the page's mapping in the physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the existing XPRR permission bits of the aperture PTE. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Caller is batching: write without a barrier; it will sync later. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* The PV head is either a single PTE pointer or a list of PV entries. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Rewrite the attribute bits of every mapping of this page. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t          pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next slot. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not governed by CPU PTE attributes; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher.  This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11337 
11338 /**
11339  * Mark a pmap as being dedicated to use for a commpage mapping.
11340  * The pmap itself will never be activated on a CPU; its mappings will
11341  * only be embedded in userspace pmaps at a fixed virtual address.
11342  *
11343  * @param pmap the pmap to mark as belonging to a commpage.
11344  */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be configured before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID.  This pmap should not ever be directly
	 * activated in a CPU's TTBR.  Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap.  Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11364 
11365 static void
11366 pmap_update_tt3e(
11367 	pmap_t pmap,
11368 	vm_address_t address,
11369 	tt_entry_t template)
11370 {
11371 	tt_entry_t *ptep, pte;
11372 
11373 	ptep = pmap_tt3e(pmap, address);
11374 	if (ptep == NULL) {
11375 		panic("%s: no ptep?", __FUNCTION__);
11376 	}
11377 
11378 	pte = *ptep;
11379 	pte = tte_to_pa(pte) | template;
11380 	write_pte_strong(ptep, pte);
11381 }
11382 
/*
 * PTE templates for commpage mappings.  Both are global (no ARM_PTE_NG),
 * writeback-cacheable, read-only mappings; they differ only in execute
 * permission.
 */
/* Note absence of non-global bit */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/* Note absence of non-global bit and no-execute bit.  */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11394 
/**
 * Allocate the commpage backing pages and build the dedicated commpage
 * pmap(s) mapping them at the fixed user VAs, returning the kernel virtual
 * addresses the kernel uses to populate the pages.
 *
 * @param kernel_data_addr Out: KVA of the commpage data page.
 * @param kernel_text_addr Out: KVA of the commpage text page (0 if no PFZ).
 * @param kernel_ro_data_addr Out: KVA of the kernel RO data page.
 * @param user_text_addr Out: randomized user VA of the text commpage
 *        (only set when CONFIG_ARM_PFZ is enabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, the backing pages come from the PPL's own page list. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page.  We forcibly nest the
	 * translation tables from this pmap into other pmaps.  The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage.  It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11545 
11546 
11547 /*
11548  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11549  * with user controlled TTEs for regions that aren't explicitly reserved by the
11550  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11551  */
11552 #if (ARM_PGSHIFT == 14)
11553 /**
11554  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11555  * commpage completely above the maximum 32-bit userspace VA.
11556  */
11557 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11558 
11559 /**
11560  * Normally there'd be an assert to check that 64-bit devices with 64-bit
11561  * userspace VAs can nest the commpage completely above the maximum 64-bit
11562  * userpace VA, but that technically isn't true on macOS. On those systems, the
11563  * commpage lives within the userspace VA range, but is protected by the VM as
11564  * a reserved region (see vm_reserved_regions[] definition for more info).
11565  */
11566 
11567 #elif (ARM_PGSHIFT == 12)
11568 /**
11569  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11570  * above the maximum userspace VA.
11571  */
11572 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11573 #else
11574 #error Nested shared page mapping is unsupported on this config
11575 #endif
11576 
/**
 * Map the commpage into a user pmap by copying the commpage pmap's
 * pre-constructed twig-level table entry into the target pmap's page tables.
 *
 * @note This shares one set of commpage page tables across all tasks rather
 *       than allocating fresh tables per process (see the "nesting" comment
 *       below).
 *
 * @param pmap The user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success. KERN_RESOURCE_SHORTAGE (PPL only) or
 *         KERN_ABORTED may be returned if pmap_expand() fails retryably;
 *         callers (pmap_insert_commpage) retry on those codes.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* In the PPL we cannot block for memory; fail with RESOURCE_SHORTAGE instead. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage lives at a fixed VA that depends on the address-space size. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* Must drop the lock across pmap_expand(), which may allocate/block. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				/* Caller will allocate a page for the PPL and retry. */
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		/* Re-acquire and re-check: the table may have appeared or vanished. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Point the twig entry at the commpage pmap's shared table. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11679 
/**
 * Remove the commpage mapping from a user pmap by clearing the shared
 * twig-level table entry installed by pmap_insert_commpage_internal(), then
 * flushing the TLB for the commpage VA.
 *
 * @param pmap The user pmap to remove the commpage mapping from.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage VA depends on the address-space size (see insert path). */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing to do if the twig table was never expanded for this pmap. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Flush the commpage VA from this pmap's ASID, then synchronize. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11736 
11737 void
11738 pmap_insert_commpage(
11739 	pmap_t pmap)
11740 {
11741 	kern_return_t kr = KERN_FAILURE;
11742 #if XNU_MONITOR
11743 	do {
11744 		kr = pmap_insert_commpage_ppl(pmap);
11745 
11746 		if (kr == KERN_RESOURCE_SHORTAGE) {
11747 			pmap_alloc_page_for_ppl(0);
11748 		}
11749 	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11750 
11751 	pmap_ledger_check_balance(pmap);
11752 #else
11753 	do {
11754 		kr = pmap_insert_commpage_internal(pmap);
11755 	} while (kr == KERN_ABORTED);
11756 #endif
11757 
11758 	if (kr != KERN_SUCCESS) {
11759 		panic("%s: failed to insert the shared page, kr=%d, "
11760 		    "pmap=%p",
11761 		    __FUNCTION__, kr,
11762 		    pmap);
11763 	}
11764 }
11765 
11766 static boolean_t
11767 pmap_is_64bit(
11768 	pmap_t pmap)
11769 {
11770 	return pmap->is_64bit;
11771 }
11772 
/* Report whether the pmap uses an "exotic" (non-standard) address-space
 * configuration; always false on this platform/configuration. */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11779 
11780 
11781 /* ARMTODO -- an implementation that accounts for
11782  * holes in the physical map, if any.
11783  */
11784 boolean_t
11785 pmap_valid_page(
11786 	ppnum_t pn)
11787 {
11788 	return pa_valid(ptoa(pn));
11789 }
11790 
11791 boolean_t
11792 pmap_bootloader_page(
11793 	ppnum_t pn)
11794 {
11795 	pmap_paddr_t paddr = ptoa(pn);
11796 
11797 	if (pa_valid(paddr)) {
11798 		return FALSE;
11799 	}
11800 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11801 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11802 }
11803 
/**
 * Check whether a VA range in a pmap contains no valid leaf mappings.
 *
 * Walks the range one twig (second-to-last level) block at a time and scans
 * the leaf PTEs of every valid table found.
 *
 * @param pmap     The pmap to scan; NULL is treated as trivially empty.
 * @param va_start Start of the VA range (inclusive).
 * @param va_end   End of the VA range (exclusive).
 *
 * @return TRUE if no PTE in the range is populated, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot not_in_kdp so lock and unlock decisions stay consistent
	 * even if the global changes (e.g. debugger entry) mid-scan. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	/* Don't take the lock when called from the kernel debugger (kdp). */
	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	while (block_start < va_end) {
		pt_entry_t     *bpte_p, *epte_p;
		pt_entry_t     *pte_p;

		/* Advance to the next twig boundary, clamped to va_end. */
		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
			/* Scan the leaf PTEs covering [block_start, block_end). */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11863 
/**
 * Check whether a VA range in a pmap contains no valid mappings.
 *
 * Dispatches to the PPL entry point on PPL-enabled systems, otherwise calls
 * the implementation directly.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11876 
11877 vm_map_offset_t
11878 pmap_max_offset(
11879 	boolean_t               is64,
11880 	unsigned int    option)
11881 {
11882 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11883 }
11884 
11885 vm_map_offset_t
11886 pmap_max_64bit_offset(
11887 	__unused unsigned int option)
11888 {
11889 	vm_map_offset_t max_offset_ret = 0;
11890 
11891 #if defined(__arm64__)
11892 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11893 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11894 		max_offset_ret = arm64_pmap_max_offset_default;
11895 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11896 		max_offset_ret = min_max_offset;
11897 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11898 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11899 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11900 		if (arm64_pmap_max_offset_default) {
11901 			max_offset_ret = arm64_pmap_max_offset_default;
11902 		} else if (max_mem > 0xC0000000) {
11903 			// devices with > 3GB of memory
11904 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11905 		} else if (max_mem > 0x40000000) {
11906 			// devices with > 1GB and <= 3GB of memory
11907 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11908 		} else {
11909 			// devices with <= 1 GB of memory
11910 			max_offset_ret = min_max_offset;
11911 		}
11912 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11913 		if (arm64_pmap_max_offset_default) {
11914 			// Allow the boot-arg to override jumbo size
11915 			max_offset_ret = arm64_pmap_max_offset_default;
11916 		} else {
11917 			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11918 		}
11919 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
11920 	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
11921 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11922 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
11923 	} else {
11924 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11925 	}
11926 
11927 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11928 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11929 		assert(max_offset_ret >= min_max_offset);
11930 	}
11931 #else
11932 	panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11933 #endif
11934 
11935 	return max_offset_ret;
11936 }
11937 
11938 vm_map_offset_t
11939 pmap_max_32bit_offset(
11940 	unsigned int option)
11941 {
11942 	vm_map_offset_t max_offset_ret = 0;
11943 
11944 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11945 		max_offset_ret = arm_pmap_max_offset_default;
11946 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11947 		max_offset_ret = VM_MAX_ADDRESS;
11948 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11949 		max_offset_ret = VM_MAX_ADDRESS;
11950 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11951 		if (arm_pmap_max_offset_default) {
11952 			max_offset_ret = arm_pmap_max_offset_default;
11953 		} else if (max_mem > 0x20000000) {
11954 			max_offset_ret = VM_MAX_ADDRESS;
11955 		} else {
11956 			max_offset_ret = VM_MAX_ADDRESS;
11957 		}
11958 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11959 		max_offset_ret = VM_MAX_ADDRESS;
11960 	} else {
11961 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11962 	}
11963 
11964 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11965 	return max_offset_ret;
11966 }
11967 
11968 #if CONFIG_DTRACE
11969 /*
11970  * Constrain DTrace copyin/copyout actions
11971  */
11972 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11973 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11974 
11975 kern_return_t
11976 dtrace_copyio_preflight(
11977 	__unused addr64_t va)
11978 {
11979 	if (current_map() == kernel_map) {
11980 		return KERN_FAILURE;
11981 	} else {
11982 		return KERN_SUCCESS;
11983 	}
11984 }
11985 
/* Post-copyio hook for DTrace; no cleanup is needed on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11992 #endif /* CONFIG_DTRACE */
11993 
11994 
/* Initialize a deferred-TLB-flush context; no state is needed on this
 * architecture, so this is a no-op. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11999 
12000 
/* Perform the deferred TLB flushes accumulated in a flush context.
 * Deferred flushing is not implemented on this architecture; flushes happen
 * eagerly, so this is a no-op. */
void
pmap_flush(
	__unused pmap_flush_context *cpus_to_flush)
{
	/* not implemented yet */
	return;
}
12008 
12009 #if XNU_MONITOR
12010 
12011 /*
12012  * Enforce that the address range described by kva and nbytes is not currently
12013  * PPL-owned, and won't become PPL-owned while pinned.  This is to prevent
12014  * unintentionally writing to PPL-owned memory.
12015  */
12016 void
12017 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
12018 {
12019 	vm_offset_t end;
12020 	if (os_add_overflow(kva, nbytes, &end)) {
12021 		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
12022 	}
12023 	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
12024 		pmap_paddr_t pa = kvtophys_nofail(ckva);
12025 		unsigned int pai = pa_index(pa);
12026 		pp_attr_t attr;
12027 		if (__improbable(!pa_valid(pa))) {
12028 			panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12029 		}
12030 		pvh_lock(pai);
12031 		if (__improbable(ckva == phystokv(pa))) {
12032 			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12033 		}
12034 		do {
12035 			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
12036 			if (__improbable(attr & PP_ATTR_MONITOR)) {
12037 				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
12038 			}
12039 		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
12040 		pvh_unlock(pai);
12041 		if (__improbable(kvtophys_nofail(ckva) != pa)) {
12042 			panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
12043 		}
12044 	}
12045 }
12046 
/**
 * Unpin each page backing [kva, kva + nbytes) by clearing PP_ATTR_NO_MONITOR.
 * Panics if a page in the range was not previously pinned.
 *
 * @param kva    Kernel VA of the start of the range.
 * @param nbytes Length of the range in bytes.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A page can never be both pinned and PPL-owned. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
12064 
12065 /**
12066  * Lock down a page, making all mappings read-only, and preventing further
12067  * mappings or removal of this particular kva's mapping. Effectively, it makes
12068  * the physical page at kva immutable (see the ppl_writable parameter for an
12069  * exception to this).
12070  *
12071  * @param kva Valid address to any mapping of the physical page to lockdown.
12072  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12073  * @param ppl_writable True if the PPL should still be able to write to the page
12074  *                     using the physical aperture mapping. False will make the
12075  *                     page read-only for both the kernel and PPL in the
12076  *                     physical aperture.
12077  */
12078 
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Common case: lock down with all alias mappings limited to read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
12084 
12085 /**
12086  * Lock down a page, giving all mappings the specified maximum permissions, and
12087  * preventing further mappings or removal of this particular kva's mapping.
12088  * Effectively, it makes the physical page at kva immutable (see the ppl_writable
12089  * parameter for an exception to this).
12090  *
12091  * @param kva Valid address to any mapping of the physical page to lockdown.
12092  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12093  * @param ppl_writable True if the PPL should still be able to write to the page
12094  *                     using the physical aperture mapping. False will make the
12095  *                     page read-only for both the kernel and PPL in the
12096  *                     physical aperture.
12097  * @param prot Maximum permissions to allow in existing alias mappings
12098  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* Callers must pass exactly a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages must not be locked down through this path. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Double-lockdown and lockdown of executable pages are disallowed. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote every existing alias mapping of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
12146 
12147 /**
12148  * Helper for releasing a page from being locked down to the PPL, making it writable to the
12149  * kernel once again.
12150  *
12151  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12152  *       to unlockdown a page that was never locked down, will panic.
12153  *
12154  * @param pai physical page index to release from lockdown.  PVH lock for this page must be held.
12155  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12156  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12157  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12158  *                     deviation will result in a panic.
12159  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unbalanced unlockdown (flag not set) indicates a caller bug. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	/* old perm must mirror what pmap_ppl_lockdown_page_with_prot() installed. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
12180 
12181 /**
12182  * Release a page from being locked down to the PPL, making it writable to the
12183  * kernel once again.
12184  *
12185  * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
12186  *       to unlockdown a page that was never locked down, will panic.
12187  *
12188  * @param kva Valid address to any mapping of the physical page to unlockdown.
12189  * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
12190  * @param ppl_writable This must match whatever `ppl_writable` parameter was
12191  *                     passed to the paired pmap_ppl_lockdown_page() call. Any
12192  *                     deviation will result in a panic.
12193  */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* Callers must pass exactly a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	/* Take the PV head lock and delegate to the locked variant. */
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
12205 
12206 #else /* XNU_MONITOR */
12207 
/* Non-PPL stub: without a PPL there is no monitor-owned memory to guard,
 * so pinning is a no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12212 
/* Non-PPL stub: paired with the pmap_pin_kernel_pages() no-op above. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12217 
12218 #endif /* !XNU_MONITOR */
12219 
12220 
/* Lock down a range of pages for code-signing purposes. On PPL systems the
 * CS lockdown-reason flag is recorded; otherwise no flag is tracked. */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12230 
/* Release a code-signing lockdown taken via pmap_cs_lockdown_pages(); the
 * flag argument must mirror the one used at lockdown time. */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12240 
12241 /**
12242  * Perform basic validation checks on the destination only and
12243  * corresponding offset/sizes prior to writing to a read only allocation.
12244  *
12245  * @note Should be called before writing to an allocation from the read
12246  * only allocator.
12247  *
12248  * @param zid The ID of the zone the allocation belongs to.
12249  * @param va VA of element being modified (destination).
12250  * @param offset Offset being written to, in the element.
12251  * @param new_data_size Size of modification.
12252  *
12253  */
12254 
12255 MARK_AS_PMAP_TEXT static void
12256 pmap_ro_zone_validate_element_dst(
12257 	zone_id_t           zid,
12258 	vm_offset_t         va,
12259 	vm_offset_t         offset,
12260 	vm_size_t           new_data_size)
12261 {
12262 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12263 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12264 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12265 	}
12266 
12267 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12268 
12269 	/* Check element is from correct zone and properly aligned */
12270 	zone_require_ro(zid, elem_size, (void*)va);
12271 
12272 	if (__improbable(new_data_size > (elem_size - offset))) {
12273 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
12274 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12275 	}
12276 	if (__improbable(offset >= elem_size)) {
12277 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
12278 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12279 	}
12280 }
12281 
12282 
12283 /**
12284  * Perform basic validation checks on the source, destination and
12285  * corresponding offset/sizes prior to writing to a read only allocation.
12286  *
12287  * @note Should be called before writing to an allocation from the read
12288  * only allocator.
12289  *
12290  * @param zid The ID of the zone the allocation belongs to.
12291  * @param va VA of element being modified (destination).
12292  * @param offset Offset being written to, in the element.
12293  * @param new_data Pointer to new data (source).
12294  * @param new_data_size Size of modification.
12295  *
12296  */
12297 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_validate_element(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	vm_offset_t sum = 0;

	/* Reject a source range that wraps around the address space. */
	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
		panic("%s: Integer addition overflow %p + %lu = %lu",
		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
	}

	/* Destination-side checks (zone range, alignment, bounds) are shared. */
	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
}
12315 
12316 /**
12317  * Ensure that physical page is locked down before writing to it.
12318  *
12319  * @note Should be called before writing to an allocation from the read
12320  * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12321  * ensure that it is called after the modification.
12322  *
12323  *
12324  * @param pa Physical address of the element being modified.
12325  * @param va Virtual address of element being modified.
12326  * @param size Size of the modification.
12327  *
12328  */
12329 
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t  pa,
	vm_offset_t         va,
	vm_size_t           size)
{
	/* A single PV head lock covers one page; multi-page writes are invalid. */
	if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
		panic("%s: va 0x%llx size 0x%llx crosses page boundary",
		    __func__, (unsigned long long)va, (unsigned long long)size);
	}
	const unsigned int pai = pa_index(pa);
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */
}
12351 
12352 /**
12353  * Unlock physical page after writing to it.
12354  *
12355  * @note Should be called after writing to an allocation from the read
12356  * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12357  * ensure that it has been called prior to the modification.
12358  *
12359  * @param pa Physical address of the element that was modified.
12360  * @param va Virtual address of element that was modified.
12361  * @param size Size of the modification.
12362  *
12363  */
12364 
12365 MARK_AS_PMAP_TEXT static void
12366 pmap_ro_zone_unlock_phy_page(
12367 	const pmap_paddr_t  pa,
12368 	vm_offset_t         va __unused,
12369 	vm_size_t           size __unused)
12370 {
12371 	const unsigned int pai = pa_index(pa);
12372 	pvh_unlock(pai);
12373 }
12374 
12375 /**
12376  * Function to copy kauth_cred from new_data to kv.
12377  * Function defined in "kern_prot.c"
12378  *
12379  * @note Will be removed upon completion of
12380  * <rdar://problem/72635194> Compiler PAC support for memcpy.
12381  *
12382  * @param kv Address to copy new data to.
12383  * @param new_data Pointer to new data.
12384  *
12385  */
12386 
12387 extern void
12388 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12389 
12390 /**
12391  * Zalloc-specific memcpy that writes through the physical aperture
12392  * and ensures the element being modified is from a read-only zone.
12393  *
12394  * @note Designed to work only with the zone allocator's read-only submap.
12395  *
12396  * @param zid The ID of the zone to allocate from.
12397  * @param va VA of element to be modified.
12398  * @param offset Offset from element.
12399  * @param new_data Pointer to new data.
12400  * @param new_data_size	Size of modification.
12401  *
12402  */
12403 
void
pmap_ro_zone_memcpy(
	zone_id_t           zid,
	vm_offset_t         va,
	vm_offset_t         offset,
	const vm_offset_t   new_data,
	vm_size_t           new_data_size)
{
	/* Dispatch to the PPL entry point when the PPL is present. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12418 
12419 MARK_AS_PMAP_TEXT void
12420 pmap_ro_zone_memcpy_internal(
12421 	zone_id_t             zid,
12422 	vm_offset_t           va,
12423 	vm_offset_t           offset,
12424 	const vm_offset_t     new_data,
12425 	vm_size_t             new_data_size)
12426 {
12427 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12428 
12429 	if (!new_data || new_data_size == 0) {
12430 		return;
12431 	}
12432 
12433 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12434 	pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12435 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12436 	pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12437 }
12438 
12439 /**
12440  * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
12442  *
12443  * @note Designed to work only with the zone allocator's read-only submap.
12444  *
12445  * @param zid The ID of the zone the element belongs to.
12446  * @param va VA of element to be modified.
12447  * @param offset Offset in element.
12448  * @param op Atomic operation to perform.
12449  * @param value	Mutation value.
12450  *
12451  */
12452 
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	/* Dispatch to the PPL entry point when the PPL is present. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12467 
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t             zid,
	vm_offset_t           va,
	vm_offset_t           offset,
	zro_atomic_op_t       op,
	uint64_t              value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The low nibble of the op encodes the operand size in bytes. */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	/* Perform the atomic mutation through the physical aperture mapping. */
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12486 
12487 /**
12488  * bzero for allocations from read only zones, that writes through the
12489  * physical aperture.
12490  *
12491  * @note This is called by the zfree path of all allocations from read
12492  * only zones.
12493  *
12494  * @param zid The ID of the zone the allocation belongs to.
12495  * @param va VA of element to be zeroed.
12496  * @param offset Offset in the element.
12497  * @param size	Size of allocation.
12498  *
12499  */
12500 
void
pmap_ro_zone_bzero(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	/* Dispatch to the PPL entry point when the PPL is present. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12514 
/*
 * Implementation of pmap_ro_zone_bzero(): zeroes a read-only zone element
 * through the physical aperture (the element's VA mapping is read-only).
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t       zid,
	vm_offset_t     va,
	vm_offset_t     offset,
	vm_size_t       size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Ensure the zeroed range lies within a single element of this zone. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12528 
12529 /**
12530  * Removes write access from the Physical Aperture.
12531  *
12532  * @note For non-PPL devices, it simply makes all virtual mappings RO.
12533  * @note Designed to work only with the zone allocator's read-only submap.
12534  *
 * @param va VA of the page to remove write access from.
12536  *
12537  */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down read-only so only the PPL may write it. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* Downgrade all mappings of the underlying physical page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12547 
12548 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
12549 
/*
 * Worker for pmap_query_resident(): counts resident and compressed bytes in
 * a page-aligned range that must fall entirely within a single twig (TTE).
 *
 * Returns the number of resident bytes, or PMAP_RESIDENT_INVALID if the pmap
 * is NULL or the covering TTE is absent (a signal to the caller to stop
 * walking).  Compressed bytes are accumulated ("+=") into *compressed_bytes_p
 * when that pointer is non-NULL.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t                  pmap,
	vm_map_address_t        start,
	vm_map_address_t        end,
	mach_vm_size_t          *compressed_bytes_p)
{
	mach_vm_size_t  resident_bytes = 0;
	mach_vm_size_t  compressed_bytes = 0;

	pt_entry_t     *bpte, *epte;
	pt_entry_t     *pte_p;
	tt_entry_t     *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if (tte_is_valid_table(*tte_p)) {
		/* Walk the leaf (PTE) entries covering [start, end). */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the caller's counter so it can be written from PPL context. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12611 
12612 mach_vm_size_t
12613 pmap_query_resident(
12614 	pmap_t                  pmap,
12615 	vm_map_address_t        start,
12616 	vm_map_address_t        end,
12617 	mach_vm_size_t          *compressed_bytes_p)
12618 {
12619 	mach_vm_size_t          total_resident_bytes;
12620 	mach_vm_size_t          compressed_bytes;
12621 	vm_map_address_t        va;
12622 
12623 
12624 	if (pmap == PMAP_NULL) {
12625 		if (compressed_bytes_p) {
12626 			*compressed_bytes_p = 0;
12627 		}
12628 		return 0;
12629 	}
12630 
12631 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12632 
12633 	total_resident_bytes = 0;
12634 	compressed_bytes = 0;
12635 
12636 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12637 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12638 	    VM_KERNEL_ADDRHIDE(end));
12639 
12640 	va = start;
12641 	while (va < end) {
12642 		vm_map_address_t l;
12643 		mach_vm_size_t resident_bytes;
12644 
12645 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12646 
12647 		if (l > end) {
12648 			l = end;
12649 		}
12650 #if XNU_MONITOR
12651 		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12652 #else
12653 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12654 #endif
12655 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
12656 			break;
12657 		}
12658 
12659 		total_resident_bytes += resident_bytes;
12660 
12661 		va = l;
12662 	}
12663 
12664 	if (compressed_bytes_p) {
12665 		*compressed_bytes_p = compressed_bytes;
12666 	}
12667 
12668 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12669 	    total_resident_bytes);
12670 
12671 	return total_resident_bytes;
12672 }
12673 
12674 #if MACH_ASSERT
12675 static void
12676 pmap_check_ledgers(
12677 	pmap_t pmap)
12678 {
12679 	int     pid;
12680 	char    *procname;
12681 
12682 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12683 		/*
12684 		 * This pmap was not or is no longer fully associated
12685 		 * with a task (e.g. the old pmap after a fork()/exec() or
12686 		 * spawn()).  Its "ledger" still points at a task that is
12687 		 * now using a different (and active) address space, so
12688 		 * we can't check that all the pmap ledgers are balanced here.
12689 		 *
12690 		 * If the "pid" is set, that means that we went through
12691 		 * pmap_set_process() in task_terminate_internal(), so
12692 		 * this task's ledger should not have been re-used and
12693 		 * all the pmap ledgers should be back to 0.
12694 		 */
12695 		return;
12696 	}
12697 
12698 	pid = pmap->pmap_pid;
12699 	procname = pmap->pmap_procname;
12700 
12701 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12702 }
12703 #endif /* MACH_ASSERT */
12704 
12705 void
12706 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
12707 {
12708 }
12709 
12710 /**
12711  * The minimum shared region nesting size is used by the VM to determine when to
12712  * break up large mappings to nested regions. The smallest size that these
12713  * mappings can be broken into is determined by what page table level those
12714  * regions are being nested in at and the size of the page tables.
12715  *
12716  * For instance, if a nested region is nesting at L2 for a process utilizing
12717  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12718  * block entry).
12719  *
12720  * @param pmap The target pmap to determine the block size based on whether it's
12721  *             using 16KB or 4KB page tables.
12722  */
12723 uint64_t
12724 pmap_shared_region_size_min(__unused pmap_t pmap)
12725 {
12726 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12727 
12728 	/**
12729 	 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12730 	 * 4KB pages). This means that a target pmap will contain L2 entries that
12731 	 * point to shared L3 page tables in the shared region pmap.
12732 	 */
12733 	return pt_attr_twig_size(pt_attr);
12734 }
12735 
12736 boolean_t
12737 pmap_enforces_execute_only(
12738 	pmap_t pmap)
12739 {
12740 	return pmap != kernel_pmap;
12741 }
12742 
/*
 * PPL-side setter for the pmap's "VM map enforces code signing" flag.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	/* Ensure this is a genuine, mutable pmap before writing to it. */
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12751 
/*
 * Record whether the VM map backed by this pmap enforces code signing.
 * Enters the PPL on monitor-enabled builds.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12763 
12764 extern int cs_process_enforcement_enable;
12765 bool
12766 pmap_get_vm_map_cs_enforced(
12767 	pmap_t pmap)
12768 {
12769 	if (cs_process_enforcement_enable) {
12770 		return true;
12771 	}
12772 	return pmap->pmap_vm_map_cs_enforced;
12773 }
12774 
12775 MARK_AS_PMAP_TEXT void
12776 pmap_set_jit_entitled_internal(
12777 	__unused pmap_t pmap)
12778 {
12779 	return;
12780 }
12781 
/*
 * Mark a pmap as JIT-entitled.  Enters the PPL on monitor-enabled builds;
 * the internal implementation is a no-op here.
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12792 
12793 bool
12794 pmap_get_jit_entitled(
12795 	__unused pmap_t pmap)
12796 {
12797 	return false;
12798 }
12799 
12800 MARK_AS_PMAP_TEXT void
12801 pmap_set_tpro_internal(
12802 	__unused pmap_t pmap)
12803 {
12804 	return;
12805 }
12806 
/*
 * Mark a pmap as using TPRO.  Enters the PPL on monitor-enabled builds; the
 * internal implementation is a no-op here.
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12817 
12818 bool
12819 pmap_get_tpro(
12820 	__unused pmap_t pmap)
12821 {
12822 	return false;
12823 }
12824 
12825 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12826 
/*
 * Worker for pmap_query_page_info(): computes the disposition of the page
 * mapped at va (present / compressed / alt-accounted / reusable / internal)
 * and writes it through the pinned pointer disp_p.
 *
 * Returns KERN_INVALID_ARGUMENT (with *disp_p = 0) for a NULL or kernel
 * pmap, KERN_SUCCESS otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile snapshot: the PTE may change while only the shared lock is held. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* Not a managed page: there is no PV data to consult. */
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the PV entry for this mapping, if the page keeps a PVE list. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12905 
/*
 * Query the disposition of the page mapped at va in the given pmap; the
 * result is written to *disp_p.  Enters the PPL on monitor-enabled builds.
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12918 
12919 
12920 
/*
 * Number of significant user VA bits for this pmap, derived from the T0SZ
 * field of the translation control register.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* With mixed page sizes the TCR value is tracked per page-table geometry. */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12931 
12932 uint32_t
12933 pmap_kernel_va_bits(void)
12934 {
12935 	return 64 - T1SZ_BOOT;
12936 }
12937 
12938 static vm_map_size_t
12939 pmap_user_va_size(pmap_t pmap)
12940 {
12941 	return 1ULL << pmap_user_va_bits(pmap);
12942 }
12943 
12944 
12945 
12946 
/**
 * Report whether the caller is currently executing inside the PPL.  This
 * configuration has no PPL, so the answer is always false.
 */
bool
pmap_in_ppl(void)
{
	return false;
}
12953 
/*
 * Perform a protected register write through the PPL I/O filter.  This
 * configuration has no PPL, so reaching this function is fatal.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12960 
/**
 * Claim a page reserved for PPL use.  Reserved PPL pages are unsupported on
 * this configuration, so no page is ever handed out.
 *
 * @return NULL always.
 */
void *
pmap_claim_reserved_ppl_page(void)
{
	return NULL;
}
12967 
12968 void
12969 pmap_free_reserved_ppl_page(void __unused *kva)
12970 {
12971 	// Unsupported
12972 }
12973 
12974 
12975 #if PMAP_CS_PPL_MONITOR
12976 
12977 /* Immutable part of the trust cache runtime */
12978 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12979 
12980 /* Mutable part of the trust cache runtime */
12981 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12982 
12983 /* Lock for the trust cache runtime */
12984 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12985 
/*
 * PPL-side check for whether a trust cache with the given UUID is loaded in
 * the PPL trust cache runtime.
 *
 * Returns KERN_SUCCESS if found, KERN_NOT_FOUND if absent, KERN_FAILURE on
 * any other libTrustCache error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13015 
/*
 * Check whether a trust cache with the given UUID is loaded.  Always
 * trampolines into the PPL (this entry point is only built when
 * PMAP_CS_PPL_MONITOR is set).
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
13022 
/*
 * PPL-side implementation of pmap_load_trust_cache_with_type(): validates
 * the caller, locks the payload and manifest pages down into PPL ownership,
 * then asks libTrustCache to load the image4-wrapped trust cache.
 *
 * On success the payload pages remain PPL-owned (the runtime keeps using
 * them); on failure they are returned to the kernel.  The manifest is
 * always returned once validation completes.
 *
 * Returns KERN_SUCCESS, KERN_ALREADY_IN_SET for duplicates,
 * KERN_RESOURCE_SHORTAGE if the caller must donate a page and retry, or
 * KERN_FAILURE on any other libTrustCache error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
13142 
/*
 * Load an image4-wrapped trust cache of the given type into the PPL trust
 * cache runtime, donating pages to the PPL as needed.
 *
 * NOTE(review): this loops for as long as the PPL reports
 * KERN_RESOURCE_SHORTAGE; forward progress relies on pmap_alloc_page_for_ppl()
 * eventually satisfying the PPL's page reservation -- confirm that is
 * guaranteed on all configurations.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
13172 
/*
 * Query the PPL trust cache runtime for a CDHash.  "Safe" means all inputs
 * are already in PPL-owned storage; callers from outside the PPL must go
 * through pmap_query_trust_cache_internal(), which copies them in first.
 *
 * Returns KERN_SUCCESS on a hit, KERN_NOT_FOUND on a miss,
 * KERN_INVALID_ARGUMENT for a bad query type, KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13211 
/*
 * PPL entry point for trust cache queries: copies the caller's CDHash into
 * PPL storage (preventing ToCToU), performs the query, then copies the
 * result token back out through pinned kernel pages.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the caller's token so it can be written from PPL context. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13236 
13237 kern_return_t
13238 pmap_query_trust_cache(
13239 	TCQueryType_t query_type,
13240 	const uint8_t cdhash[kTCEntryHashSize],
13241 	TrustCacheQueryToken_t *query_token)
13242 {
13243 	kern_return_t ret = KERN_NOT_FOUND;
13244 
13245 	ret = pmap_query_trust_cache_ppl(
13246 		query_type,
13247 		cdhash,
13248 		query_token);
13249 
13250 	return ret;
13251 }
13252 
13253 MARK_AS_PMAP_DATA uint8_t ppl_developer_mode_set = 0;
13254 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13255 
/*
 * PPL-side toggle of the developer mode state.  Enabling is only permitted
 * within the first few calls after boot (tracked by ppl_developer_mode_set);
 * disabling may happen at any time.
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
#if PMAP_CS_INCLUDE_INTERNAL_CODE
	/*
	 * On internal builds, we may call into the PPL twice in order to enable developer
	 * mode during early boot and during data migration. The latter does not happen for
	 * non-internal builds, and thus those only need to support a single transition to
	 * enabling developer mode.
	 */
	const uint8_t epoch_enable = 2;
#else
	const uint8_t epoch_enable = 1;
#endif

	/*
	 * We don't really care if the state is false -- in that case, the transition can
	 * happen as many times as needed. However, we still need to increment whenever we
	 * set the state as such. This is partly because we need to track whether we have
	 * actually resolved the state or not, and also because we expect developer mode
	 * to only be enabled during the first or second (internal-only) call into this
	 * function.
	 */
	uint8_t epoch = os_atomic_inc_orig(&ppl_developer_mode_set, relaxed);

	if (state == os_atomic_load(&ppl_developer_mode_storage, relaxed)) {
		/* No change requested: nothing to do (the epoch was still consumed). */
		return;
	} else if ((state == true) && (epoch >= epoch_enable)) {
		panic("PMAP_CS: enabling developer mode incorrectly [%u]", epoch);
	}

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13291 
/*
 * Toggle the system developer mode state; the transition is validated and
 * recorded inside the PPL.
 */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13298 
13299 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13300 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13301 
13302 #pragma mark Image4 - New
13303 
/*
 * Pairs an image4 code-signing trap selector with the AppleImage4 handler
 * resolved for it, so per-selector helpers can dispatch without having to
 * re-resolve the handler.
 */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;
	image4_cs_trap_handler_t handler;
} pmap_image4_dispatch_t;
13308 
/*
 * Marshal and dispatch the "kmod set release type" image4 selector.  The
 * argument structure is copied into PPL storage before use.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13328 
13329 
13330 
/*
 * Marshal and dispatch the "nonce set" image4 selector.  The argument
 * structure is copied into PPL storage before use.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13351 
/*
 * Marshal and dispatch the "nonce roll" image4 selector.  The argument
 * structure is copied into PPL storage before use; it carries no pointers
 * that require page lockdown.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13368 
/*
 * Marshal and dispatch the "image activate" image4 selector.  The payload
 * and manifest buffers are locked down into PPL ownership for the duration
 * of validation; on success the payload stays PPL-owned, on failure it is
 * handed back along with the manifest.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13424 
13425 MARK_AS_PMAP_TEXT static errno_t
13426 _pmap_image4_monitor_trap_passthrough(
13427 	__unused const pmap_image4_dispatch_t *dispatch,
13428 	__unused const void *input_data,
13429 	__unused size_t input_size)
13430 {
13431 #if DEVELOPMENT || DEBUG || KASAN
13432 	return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13433 #else
13434 	pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13435 	return ENOSYS;
13436 #endif
13437 }
13438 
/*
 * PPL-side dispatcher for image4 code-signing traps: resolves the handler
 * for the selector, verifies the caller-supplied input size, reserves a
 * page for CoreCrypto, and routes to the per-selector marshaling helper.
 *
 * Returns 0 on success, EINVAL for an unknown selector or bad input size,
 * ENOMEM when the caller must donate a page and retry, or the handler's
 * errno.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13504 
/*
 * Kernel-side entry point for image4 code-signing traps.  Donates pages to
 * the PPL and retries for as long as the PPL reports ENOMEM (page
 * reservation shortage).
 */
errno_t
pmap_image4_monitor_trap(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	errno_t err = EPERM;

	err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	while (err == ENOMEM) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Call the monitor dispatch again */
		err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
	}

	return err;
}
13524 
13525 #endif /* PMAP_CS_PPL_MONITOR */
13526 
13527 #if PMAP_CS_INCLUDE_CODE_SIGNING
13528 
/*
 * Comparator for the registered-profiles red-black tree. Profiles are
 * ordered purely by their addresses; returns <0, 0, or >0 in the usual
 * comparator convention.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13541 
/*
 * Red-black tree for managing provisioning profiles. Nodes are ordered by
 * object address (see pmap_cs_profiles_rbtree_compare), so the tree acts as
 * a membership set of registered profile objects.
 */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/*
 * Lock for the profile red-black tree. Initialized non-sleepable in
 * pmap_initialize_provisioning_profiles.
 */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13551 
13552 void
13553 pmap_initialize_provisioning_profiles(void)
13554 {
13555 	/* Initialize the profiles red-black tree lock */
13556 	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
13557 	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;
13558 
13559 	/* Initialize the red-black tree itself */
13560 	RB_INIT(&pmap_cs_registered_profiles);
13561 
13562 	printf("initialized PPL provisioning profile data\n");
13563 }
13564 
13565 static bool
13566 pmap_is_testflight_profile(
13567 	pmap_cs_profile_t *profile_obj)
13568 {
13569 	const char *entitlement_name = "beta-reports-active";
13570 	const size_t entitlement_length = strlen(entitlement_name);
13571 	CEQueryOperation_t query[2] = {0};
13572 
13573 	/* If the profile provisions no entitlements, then it isn't a test flight one */
13574 	if (profile_obj->entitlements_ctx == NULL) {
13575 		return false;
13576 	}
13577 
13578 	/* Build our CoreEntitlements query */
13579 	query[0].opcode = kCEOpSelectKey;
13580 	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13581 	query[0].parameters.stringParameter.length = entitlement_length;
13582 	query[1] = CEMatchBool(true);
13583 
13584 	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13585 		profile_obj->entitlements_ctx,
13586 		query, 2);
13587 
13588 	if (ce_err == amfi->CoreEntitlements.kNoError) {
13589 		return true;
13590 	}
13591 
13592 	return false;
13593 }
13594 
13595 static bool
13596 pmap_is_development_profile(
13597 	pmap_cs_profile_t *profile_obj)
13598 {
13599 	/* Check for UPP */
13600 	const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13601 		*profile_obj->profile_ctx,
13602 		CESelectDictValue("ProvisionsAllDevices"));
13603 	if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13604 		if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13605 			pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13606 			return false;
13607 		}
13608 	}
13609 
13610 	/* Check for TestFlight profile */
13611 	if (pmap_is_testflight_profile(profile_obj) == true) {
13612 		pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13613 		return false;
13614 	}
13615 
13616 	pmap_cs_log_info("%p: development profile", profile_obj);
13617 	return true;
13618 }
13619 
/*
 * Extract and validate the "Entitlements" dictionary embedded in a profile,
 * caching a CoreEntitlements query context on the profile object for later
 * queries (e.g. pmap_is_testflight_profile).
 *
 * Returns KERN_SUCCESS when the context is set up, KERN_NOT_FOUND when the
 * profile provisions no entitlements (entitlements_ctx left NULL), and
 * KERN_ABORTED when CoreEntitlements rejects the entitlements data.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Locate the "Entitlements" value within the profile's DER context */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements present -- clear any cached context state */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Raw DER byte range backing the entitlements dictionary */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER blob with the CoreEntitlements runtime */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Turn the validation result into a queryable context */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13670 
/*
 * PPL-side registration of a provisioning profile. The caller-provided
 * payload is locked down, its embedded profile blob is validated through
 * CoreTrust, a CoreEntitlements context is constructed for the profile
 * content, and the resulting profile object is inserted into the
 * registered-profiles tree.
 *
 * Returns KERN_RESOURCE_SHORTAGE when the PPL has no spare page -- the
 * kernel-side wrapper donates a page and retries. Validation failures after
 * lockdown are treated as fatal (panic).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile blob's signature chain through CoreTrust */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements DER context over the validated content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/*
	 * Acquire a writable version of the profile data structure. The payload
	 * itself was locked down above, so writes go through the physical
	 * aperture mapping instead.
	 */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13774 
13775 kern_return_t
13776 pmap_register_provisioning_profile(
13777 	const vm_address_t payload_addr,
13778 	const vm_size_t payload_size)
13779 {
13780 	kern_return_t ret = KERN_DENIED;
13781 
13782 	ret = pmap_register_provisioning_profile_ppl(
13783 		payload_addr,
13784 		payload_size);
13785 
13786 	while (ret == KERN_RESOURCE_SHORTAGE) {
13787 		/* Allocate a page from the free list */
13788 		pmap_alloc_page_for_ppl(0);
13789 
13790 		/* Attempt the call again */
13791 		ret = pmap_register_provisioning_profile_ppl(
13792 			payload_addr,
13793 			payload_size);
13794 	}
13795 
13796 	return ret;
13797 }
13798 
/*
 * PPL-side unregistration of a provisioning profile. The profile must be
 * present in the registered-profiles tree (fatal otherwise) and must not be
 * referenced by any code signature; returns KERN_FAILURE while references
 * remain. On success, the backing payload pages are unlocked and handed
 * back.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* Profiles still referenced by a signature cannot be unregistered */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Recompute the payload size and round up to a page boundary */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The object lives inside the payload just unlocked -- drop it */
		profile_obj = NULL;
	}
	return ret;
}
13846 
/*
 * Kernel-side entry point for unregistering a provisioning profile; simply
 * forwards into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13853 
/*
 * Associate a registered provisioning profile with a code signature. Only
 * allowed while the signature is still untrusted and has no profile attached.
 * Takes a reference on the profile object which is dropped again by
 * pmap_disassociate_provisioning_profile_internal.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Both of these indicate a caller bug and are fatal */
	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13902 
/*
 * Kernel-side entry point for associating a profile with a signature;
 * simply forwards into the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13910 
/*
 * Remove the profile association from a code signature, dropping the
 * reference taken at association time. Returns KERN_NOT_FOUND when the
 * signature has no associated profile.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13946 
/*
 * Kernel-side entry point for disassociating a profile from a signature;
 * simply forwards into the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13953 
13954 kern_return_t
13955 pmap_associate_kernel_entitlements_internal(
13956 	pmap_cs_code_directory_t *cd_entry,
13957 	const void *kernel_entitlements)
13958 {
13959 	kern_return_t ret = KERN_DENIED;
13960 
13961 	if (kernel_entitlements == NULL) {
13962 		panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13963 	}
13964 
13965 	/* Acquire the lock on the code directory */
13966 	pmap_cs_lock_code_directory(cd_entry);
13967 
13968 	if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13969 		ret = KERN_DENIED;
13970 		goto out;
13971 	} else if (cd_entry->kernel_entitlements != NULL) {
13972 		ret = KERN_DENIED;
13973 		goto out;
13974 	}
13975 	cd_entry->kernel_entitlements = kernel_entitlements;
13976 
13977 	/* Association was a success */
13978 	ret = KERN_SUCCESS;
13979 
13980 out:
13981 	lck_rw_unlock_exclusive(&cd_entry->rwlock);
13982 	return ret;
13983 }
13984 
/*
 * Kernel-side entry point for associating kernel entitlements; simply
 * forwards into the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13992 
/*
 * Resolve the kernel entitlements object previously associated with the
 * code signature of a user pmap's main region.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap, or when the pmap has no
 * signature or no associated entitlements; KERN_ABORTED when the pmap lock
 * could not be taken without waiting (callers retry); KERN_SUCCESS once the
 * pointer has been written out through kernel_entitlements (if non-NULL).
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
14055 
14056 kern_return_t
14057 pmap_resolve_kernel_entitlements(
14058 	pmap_t pmap,
14059 	const void **kernel_entitlements)
14060 {
14061 	kern_return_t ret = KERN_DENIED;
14062 
14063 	do {
14064 		ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
14065 	} while (ret == KERN_ABORTED);
14066 
14067 	return ret;
14068 }
14069 
/*
 * Build a CoreEntitlements acceleration index for a signature's entitlements
 * context. The index buffer is carved out of spare space within the
 * locked-down code signature when it fits; otherwise it comes from the PPL
 * blob allocator or, failing that, a whole PPL page. Allocation failures
 * propagate to the caller (the kernel-side wrapper retries on
 * KERN_RESOURCE_SHORTAGE); KERN_ABORTED indicates the index cannot fit in a
 * page and acceleration is simply skipped.
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements, or already accelerated -- trivially successful */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		/* Place the buffer in the slack after the signature, pointer-aligned */
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Not separately allocated -- lives within the signature itself */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
14216 
14217 kern_return_t
14218 pmap_accelerate_entitlements(
14219 	pmap_cs_code_directory_t *cd_entry)
14220 {
14221 	kern_return_t ret = KERN_DENIED;
14222 
14223 	ret = pmap_accelerate_entitlements_ppl(cd_entry);
14224 	while (ret == KERN_RESOURCE_SHORTAGE) {
14225 		/* Allocate a page for the PPL */
14226 		pmap_alloc_page_for_ppl(0);
14227 
14228 		/* Try again */
14229 		ret = pmap_accelerate_entitlements_ppl(cd_entry);
14230 	}
14231 
14232 	return ret;
14233 }
14234 
14235 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14236 
14237 MARK_AS_PMAP_TEXT bool
14238 pmap_lookup_in_loaded_trust_caches_internal(
14239 	const uint8_t cdhash[CS_CDHASH_LEN])
14240 {
14241 	kern_return_t kr = KERN_NOT_FOUND;
14242 
14243 #if PMAP_CS_PPL_MONITOR
14244 	/*
14245 	 * If we have the PPL monitor, then this function can only be called from
14246 	 * within the PPL. Calling it directly would've caused a panic, so we can
14247 	 * assume that we're in the PPL here.
14248 	 */
14249 	uint8_t cdhash_safe[CS_CDHASH_LEN];
14250 	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14251 
14252 	kr = pmap_query_trust_cache_safe(
14253 		kTCQueryTypeLoadable,
14254 		cdhash_safe,
14255 		NULL);
14256 #else
14257 	kr = query_trust_cache(
14258 		kTCQueryTypeLoadable,
14259 		cdhash,
14260 		NULL);
14261 #endif
14262 
14263 	if (kr == KERN_SUCCESS) {
14264 		return true;
14265 	}
14266 	return false;
14267 }
14268 
/*
 * Query the loadable trust caches for a CDHash, routing through the PPL
 * when the system is built with the monitor.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14279 
/*
 * Query the static trust cache for a CDHash. On a hit, returns a packed
 * result word combining TC_LOOKUP_FOUND, the entry's hash type, and its
 * flags (truncated to 8 bits); returns 0 on a miss.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Pull the entry's flags and hash type out of the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		/* Note: only the low 8 bits of the 64-bit flags are reported */
		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14320 
/*
 * Query the static trust cache for a CDHash, routing through the PPL when
 * the system is built with the monitor.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14330 
14331 #if PMAP_CS_INCLUDE_CODE_SIGNING
14332 
/* Protects pmap_compilation_service_cdhash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash identifying the compilation service; all-zero until set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14335 
14336 MARK_AS_PMAP_TEXT void
14337 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14338 {
14339 
14340 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14341 	memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14342 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14343 
14344 	pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14345 	    cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14346 }
14347 
14348 MARK_AS_PMAP_TEXT bool
14349 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14350 {
14351 	bool match = false;
14352 
14353 	/* Lockdown mode disallows compilation service */
14354 	if (ppl_lockdown_mode_enabled == true) {
14355 		return false;
14356 	}
14357 
14358 	pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14359 	if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14360 		match = true;
14361 	}
14362 	pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14363 
14364 	if (match) {
14365 		pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14366 	}
14367 
14368 	return match;
14369 }
14370 
/*
 * Set the compilation service CDHash, routing through the PPL when the
 * system is built with the monitor.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14380 
/*
 * Match a CDHash against the compilation service CDHash, routing through
 * the PPL when the system is built with the monitor.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14390 
14391 /*
14392  * As part of supporting local signing on the device, we need the PMAP layer
14393  * to store the local signing key so that PMAP_CS can validate with it. We
14394  * store it at the PMAP layer such that it is accessible to both AMFI and
14395  * PMAP_CS should they need it.
14396  */
/* Set exactly once (via atomic cmpxchg) when the key below is installed */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* Local signing public key; valid only once the flag above is true */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14399 
14400 MARK_AS_PMAP_TEXT void
14401 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14402 {
14403 	bool key_set = false;
14404 
14405 	/*
14406 	 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14407 	 * a successful exchange means that the local signing public key has _not_ been
14408 	 * set. In case the key has been set, we panic as we would never expect the
14409 	 * kernel to attempt to set the key more than once.
14410 	 */
14411 	key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14412 
14413 	if (key_set) {
14414 		panic("attempted to set the local signing public key multiple times");
14415 	}
14416 
14417 	memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14418 	pmap_cs_log_info("set local signing public key");
14419 }
14420 
/*
 * Install the local signing public key, dispatching through the PPL
 * trampoline when XNU_MONITOR is enabled.  Panics if called more than
 * once (see the internal implementation).
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14430 
14431 uint8_t*
14432 pmap_get_local_signing_public_key(void)
14433 {
14434 	bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14435 
14436 	if (key_set) {
14437 		return pmap_local_signing_public_key;
14438 	}
14439 
14440 	return NULL;
14441 }
14442 
14443 /*
14444  * Locally signed applications need to be explicitly authorized by an entitled application
14445  * before we allow them to run.
14446  */
/* CDHash of the locally-signed application currently authorized to run. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects reads/writes of pmap_local_signing_cdhash. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14449 
14450 MARK_AS_PMAP_TEXT void
14451 pmap_unrestrict_local_signing_internal(
14452 	const uint8_t cdhash[CS_CDHASH_LEN])
14453 {
14454 
14455 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14456 	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14457 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14458 
14459 	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
14460 	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
14461 }
14462 
/*
 * Authorize the given CDHash to run with local signing, dispatching
 * through the PPL trampoline when XNU_MONITOR is enabled.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
14473 
14474 #if PMAP_CS
14475 MARK_AS_PMAP_TEXT static void
14476 pmap_restrict_local_signing(void)
14477 {
14478 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14479 	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
14480 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14481 }
14482 
14483 MARK_AS_PMAP_TEXT static bool
14484 pmap_local_signing_restricted(
14485 	const uint8_t cdhash[CS_CDHASH_LEN])
14486 {
14487 	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14488 	int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14489 	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14490 
14491 	return ret != 0;
14492 }
14493 
14494 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14495 #endif
14496 
/*
 * Suspend or resume VM footprint accounting for the current thread.
 * Only meaningful on DEVELOPMENT/DEBUG kernels; a no-op otherwise.
 *
 * @param map     Map whose pmap records that footprint accounting was
 *                suspended (only updated on suspend).
 * @param suspend TRUE to suspend, FALSE to resume.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t        map,
	boolean_t       suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		/*
		 * footprint_was_suspended is deliberately left TRUE here; the
		 * name suggests it is a sticky "was ever suspended" marker
		 * rather than current state. NOTE(review): inferred from the
		 * name — confirm against the consumers of this field.
		 */
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14514 
/*
 * Suspend or resume footprint accounting for the current thread,
 * dispatching through the PPL trampoline when XNU_MONITOR is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14526 
/*
 * No-op PPL entry point: validates that the pointer refers to a genuine,
 * mutable pmap and does nothing else.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14532 
/*
 * No-op pmap call, dispatching through the PPL trampoline when
 * XNU_MONITOR is enabled (useful for exercising the PPL entry path).
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14542 
14543 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14544 
/*
 * Header preceding each raw table copy in a page-table dump buffer
 * (see pmap_dump_page_tables_recurse()).
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA translated by this table */
	uint64_t end_va;      /* VA just past the range translated by this table */
};
14551 
/*
 * Recursively copy a pmap's translation tables into a caller-supplied
 * buffer.  For each table at a level selected by level_mask, a
 * struct page_table_dump_header followed by the raw table contents is
 * appended at *bytes_copied, which is advanced accordingly.
 *
 * @param pmap         pmap whose tables are being dumped.
 * @param ttp          kernel-virtual pointer to the table at this level.
 * @param cur_level    translation level of ttp.
 * @param level_mask   bitmask of levels whose tables should be recorded.
 * @param start_va     first VA translated by ttp.
 * @param buf_start    beginning of the output buffer.
 * @param buf_end      end of the output buffer.
 * @param bytes_copied in/out: number of bytes appended so far.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer
 *         cannot hold this table's header plus contents.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	/* Per-level geometry and descriptor decoding masks. */
	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be a different size than interior tables. */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/*
	 * Conservative space check: performed even when this level is not
	 * recorded, guaranteeing room before any copy below.
	 */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit the header followed by the raw table contents. */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	/* Walk every entry, descending into table entries. */
	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block mappings have no next-level table. */
			continue;
		} else {
			/* A non-block entry at the leaf level cannot be a table. */
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14626 
/*
 * Dump the given pmap's page tables into [bufp, buf_end), recording the
 * levels selected by level_mask.  Must only be called from kernel
 * debugger context; panics otherwise.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE if the buffer
 *         is too small (see pmap_dump_page_tables_recurse()).
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14636 
14637 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14638 
/*
 * Stub for configurations without page-table dumping support
 * (non-arm64 or RELEASE builds).
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14645 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14646 
14647 
14648 #ifdef CONFIG_XNUPOST
14649 #ifdef __arm64__
/* Set by pmap_test_fault_handler() when an expected test fault is taken;
 * volatile because it is written from the fault-handling path. */
static volatile bool pmap_test_took_fault = false;
14651 
14652 static bool
14653 pmap_test_fault_handler(arm_saved_state_t * state)
14654 {
14655 	bool retval                 = false;
14656 	uint64_t esr                = get_saved_state_esr(state);
14657 	esr_exception_class_t class = ESR_EC(esr);
14658 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
14659 
14660 	if ((class == ESR_EC_DABORT_EL1) &&
14661 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14662 		pmap_test_took_fault = true;
14663 		/* return to the instruction immediately after the call to NX page */
14664 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14665 		retval = true;
14666 	}
14667 
14668 	return retval;
14669 }
14670 
/*
 * Perform a single read or write to va, optionally after switching to
 * the given pmap, and report whether the access faulted as expected.
 *
 * @param pmap         pmap to switch to for the access; NULL to use the
 *                     current address space.
 * @param va           address to access.
 * @param should_fault whether the access is expected to fault.
 * @param is_write     true to perform a store, false a load.
 *
 * @return true if the observed fault behavior matched should_fault.
 */
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;
	thread_t thread = current_thread();

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap, thread);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	/* Arm the expected-fault handler, then perform the access. */
	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap, thread);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14724 
14725 static bool
14726 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14727 {
14728 	bool retval = pmap_test_access(pmap, va, should_fault, false);
14729 
14730 	if (!retval) {
14731 		T_FAIL("%s: %s, "
14732 		    "pmap=%p, va=%p, should_fault=%u",
14733 		    __func__, should_fault ? "did not fault" : "faulted",
14734 		    pmap, (void*)va, (unsigned)should_fault);
14735 	}
14736 
14737 	return retval;
14738 }
14739 
14740 static bool
14741 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14742 {
14743 	bool retval = pmap_test_access(pmap, va, should_fault, true);
14744 
14745 	if (!retval) {
14746 		T_FAIL("%s: %s, "
14747 		    "pmap=%p, va=%p, should_fault=%u",
14748 		    __func__, should_fault ? "did not fault" : "faulted",
14749 		    pmap, (void*)va, (unsigned)should_fault);
14750 	}
14751 
14752 	return retval;
14753 }
14754 
14755 static bool
14756 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14757 {
14758 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14759 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14760 
14761 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14762 
14763 	if (!retval) {
14764 		T_FAIL("%s: bits=%u, "
14765 		    "pa=%p, should_be_set=%u",
14766 		    __func__, bits,
14767 		    (void*)pa, should_be_set);
14768 	}
14769 
14770 	return retval;
14771 }
14772 
/*
 * Check both read and write access to va against the allowed
 * permissions.  Note the deliberate bitwise '|' (not '||'): both
 * accesses must be attempted even if the first check fails.
 */
static __attribute__((noinline)) bool
pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
{
	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
	return retval;
}
14779 
/*
 * Run the pmap unit-test sequence against a pmap created with the given
 * flags: mapping/PTE validation, permission faults, ref/mod state
 * machine, pmap_protect/pmap_page_protect, and disconnect.
 *
 * @param flags Creation flags for pmap_create_options() (e.g.
 *              PMAP_CREATE_64BIT, PMAP_CREATE_FORCE_4K_PAGES).
 *
 * @return 0 on success; failures surface via T_FAIL or panic.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	/* page_ratio = number of pmap pages per native kernel page. */
	if (pmap_page_size <= native_page_size) {
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT.  This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault.  Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages.  Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
15044 #endif /* __arm64__ */
15045 
/*
 * Kernel POST entry point for the pmap tests.  On arm64, runs
 * pmap_test_test_config() for each supported page-size configuration;
 * a no-op elsewhere.
 *
 * @return KERN_SUCCESS (individual failures surface via T_FAIL/panic).
 */
kern_return_t
pmap_test(void)
{
	T_LOG("Starting pmap_tests");
#ifdef __arm64__
	int flags = 0;
	flags |= PMAP_CREATE_64BIT;

#if __ARM_MIXED_PAGE_SIZE__
	T_LOG("Testing VM_PAGE_SIZE_4KB");
	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
	T_LOG("Testing VM_PAGE_SIZE_16KB");
	pmap_test_test_config(flags);
#else /* __ARM_MIXED_PAGE_SIZE__ */
	pmap_test_test_config(flags);
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#endif /* __arm64__ */
	T_PASS("completed pmap_test successfully");
	return KERN_SUCCESS;
}
15067 #endif /* CONFIG_XNUPOST */
15068 
15069 /*
15070  * The following function should never make it to RELEASE code, since
15071  * it provides a way to get the PPL to modify text pages.
15072  */
15073 #if DEVELOPMENT || DEBUG
15074 
/* Illegal (undefined) instruction encodings used to corrupt text pages. */
#define ARM_UNDEFINED_INSN 0xe7f000f0
#define ARM_UNDEFINED_INSN_THUMB 0xde00
15077 
15078 /**
15079  * Forcibly overwrite executable text with an illegal instruction.
15080  *
15081  * @note Only used for xnu unit testing.
15082  *
15083  * @param pa The physical address to corrupt.
15084  *
15085  * @return KERN_SUCCESS on success.
15086  */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* With the PPL enabled, the write must be performed by the PPL. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
15096 
/*
 * Implementation of pmap_test_text_corruption(): overwrites the
 * instruction at the given physical address with an undefined encoding,
 * temporarily making the physical-aperture mapping writable when the
 * page is marked executable, then invalidates the I-cache for the
 * modified region.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	/* Hold the PV head lock while the page's aperture AP is manipulated. */
	pvh_lock(pai);

	pv_entry_t **pv_h  = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages have a read-only physical aperture mapping (it is
	 * restored to AP_RONA below); switch it to writable for the store.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure the stale instruction cannot be fetched from the I-cache. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
15139 
15140 #endif /* DEVELOPMENT || DEBUG */
15141