xref: /xnu-12377.41.6/osfmk/arm64/sptm/pmap/pmap.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2011-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/backtrace.h>
42 #include <kern/bits.h>
43 #include <kern/ecc.h>
44 #include <kern/thread.h>
45 #include <kern/sched.h>
46 #include <kern/zalloc.h>
47 #include <kern/zalloc_internal.h>
48 #include <kern/kalloc.h>
49 #include <kern/spl.h>
50 #include <kern/startup.h>
51 #include <kern/trap_telemetry.h>
52 #include <kern/trustcache.h>
53 
54 #include <os/overflow.h>
55 
56 #include <vm/pmap.h>
57 #include <vm/pmap_cs.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern.h>
60 #include <vm/vm_protos.h>
61 #include <vm/vm_object_internal.h>
62 #include <vm/vm_page_internal.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/cpm_internal.h>
65 
66 
67 #include <libkern/section_keywords.h>
68 #include <sys/errno.h>
69 
70 #include <libkern/amfi/amfi.h>
71 #include <sys/trusted_execution_monitor.h>
72 #include <sys/trust_caches.h>
73 #include <sys/code_signing.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/thread.h>
77 #include <machine/lowglobals.h>
78 
79 #include <arm/caches_internal.h>
80 #include <arm/cpu_data.h>
81 #include <arm/cpu_data_internal.h>
82 #include <arm/cpu_capabilities.h>
83 #include <arm/cpu_number.h>
84 #include <arm/machine_cpu.h>
85 #include <arm/misc_protos.h>
86 #include <arm/trap_internal.h>
87 #include <arm64/sptm/pmap/pmap_internal.h>
88 #include <arm64/sptm/sptm.h>
89 
90 #include <arm64/proc_reg.h>
91 #include <pexpert/arm64/boot.h>
92 #include <arm64/ppl/uat.h>
93 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
94 #include <arm64/amcc_rorgn.h>
95 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
96 
97 #include <pexpert/device_tree.h>
98 
99 #include <san/kasan.h>
100 #include <sys/cdefs.h>
101 
102 #if defined(HAS_APPLE_PAC)
103 #include <ptrauth.h>
104 #endif
105 
106 #ifdef CONFIG_XNUPOST
107 #include <tests/xnupost.h>
108 #endif
109 
110 
111 #if HIBERNATION
112 #include <IOKit/IOHibernatePrivate.h>
113 #endif /* HIBERNATION */
114 
115 #ifdef __ARM64_PMAP_SUBPAGE_L1__
116 /**
117  * Unlike the PPL, PMAP_ROOT_ALLOC_SIZE for subpage-L1 devices is 128 bytes
118  * rather than 64 bytes, due to the metadata the SPTM needs to track the
119  * subpage L1 tables.
120  */
121 #define PMAP_ROOT_ALLOC_SIZE SUBPAGE_USER_ROOT_TABLE_SIZE
122 #else
123 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
124 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
125 
126 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
127 
128 
129 /**
130  * Per-CPU data used to do setup and post-processing for SPTM calls.
131  * On the setup side, this structure is used to store parameters for batched SPTM operations.
132  * These parameters may be large (upwards of 1K), and given that SPTM calls are generally
133  * issued from preemption-disabled contexts anyway, it's better to store them in per-CPU
134  * data rather than the local stack.
135  * On the post-processing side, this structure exposes a pointer to the SPTM's per-CPU array
136  * of 'prev_ptes', that is the prior value encountered in each PTE at the time of the SPTM's
137  * atomic update of that PTE.
138  */
139 pmap_sptm_percpu_data_t PERCPU_DATA(pmap_sptm_percpu);
140 
141 /**
142  * Reference group for global tracking of all outstanding pmap references.
143  */
144 os_refgrp_decl(static, pmap_refgrp, "pmap", NULL);
145 
146 /* Boot-arg to enable/disable the use of XNU_KERNEL_RESTRICTED type in SPTM. */
147 TUNABLE(bool, use_xnu_restricted, "xnu_restricted", true);
148 
149 extern u_int32_t random(void); /* from <libkern/libkern.h> */
150 
151 static bool alloc_asid(pmap_t pmap);
152 static void free_asid(pmap_t pmap);
153 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
154 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
155 
156 const struct page_table_ops native_pt_ops =
157 {
158 	.alloc_id = alloc_asid,
159 	.free_id = free_asid,
160 	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
161 	.wimg_to_pte = wimg_to_pte,
162 };
163 
164 const struct page_table_level_info pmap_table_level_info_16k[] =
165 {
166 	[0] = {
167 		.size       = ARM_16K_TT_L0_SIZE,
168 		.offmask    = ARM_16K_TT_L0_OFFMASK,
169 		.shift      = ARM_16K_TT_L0_SHIFT,
170 		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
171 		.valid_mask = ARM_TTE_VALID,
172 		.type_mask  = ARM_TTE_TYPE_MASK,
173 		.type_block = ARM_TTE_TYPE_BLOCK
174 	},
175 	[1] = {
176 		.size       = ARM_16K_TT_L1_SIZE,
177 		.offmask    = ARM_16K_TT_L1_OFFMASK,
178 		.shift      = ARM_16K_TT_L1_SHIFT,
179 		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
180 		.valid_mask = ARM_TTE_VALID,
181 		.type_mask  = ARM_TTE_TYPE_MASK,
182 		.type_block = ARM_TTE_TYPE_BLOCK
183 	},
184 	[2] = {
185 		.size       = ARM_16K_TT_L2_SIZE,
186 		.offmask    = ARM_16K_TT_L2_OFFMASK,
187 		.shift      = ARM_16K_TT_L2_SHIFT,
188 		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
189 		.valid_mask = ARM_TTE_VALID,
190 		.type_mask  = ARM_TTE_TYPE_MASK,
191 		.type_block = ARM_TTE_TYPE_BLOCK
192 	},
193 	[3] = {
194 		.size       = ARM_16K_TT_L3_SIZE,
195 		.offmask    = ARM_16K_TT_L3_OFFMASK,
196 		.shift      = ARM_16K_TT_L3_SHIFT,
197 		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
198 		.valid_mask = ARM_PTE_TYPE_VALID,
199 		.type_mask  = ARM_TTE_TYPE_MASK,
200 		.type_block = ARM_TTE_TYPE_L3BLOCK
201 	}
202 };
203 
204 const struct page_table_level_info pmap_table_level_info_4k[] =
205 {
206 	[0] = {
207 		.size       = ARM_4K_TT_L0_SIZE,
208 		.offmask    = ARM_4K_TT_L0_OFFMASK,
209 		.shift      = ARM_4K_TT_L0_SHIFT,
210 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
211 		.valid_mask = ARM_TTE_VALID,
212 		.type_mask  = ARM_TTE_TYPE_MASK,
213 		.type_block = ARM_TTE_TYPE_BLOCK
214 	},
215 	[1] = {
216 		.size       = ARM_4K_TT_L1_SIZE,
217 		.offmask    = ARM_4K_TT_L1_OFFMASK,
218 		.shift      = ARM_4K_TT_L1_SHIFT,
219 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
220 		.valid_mask = ARM_TTE_VALID,
221 		.type_mask  = ARM_TTE_TYPE_MASK,
222 		.type_block = ARM_TTE_TYPE_BLOCK
223 	},
224 	[2] = {
225 		.size       = ARM_4K_TT_L2_SIZE,
226 		.offmask    = ARM_4K_TT_L2_OFFMASK,
227 		.shift      = ARM_4K_TT_L2_SHIFT,
228 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
229 		.valid_mask = ARM_TTE_VALID,
230 		.type_mask  = ARM_TTE_TYPE_MASK,
231 		.type_block = ARM_TTE_TYPE_BLOCK
232 	},
233 	[3] = {
234 		.size       = ARM_4K_TT_L3_SIZE,
235 		.offmask    = ARM_4K_TT_L3_OFFMASK,
236 		.shift      = ARM_4K_TT_L3_SHIFT,
237 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
238 		.valid_mask = ARM_PTE_TYPE_VALID,
239 		.type_mask  = ARM_TTE_TYPE_MASK,
240 		.type_block = ARM_TTE_TYPE_L3BLOCK
241 	}
242 };
243 
244 const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
245 {
246 	[0] = { /* Unused */
247 		.size       = ARM_4K_TT_L0_SIZE,
248 		.offmask    = ARM_4K_TT_L0_OFFMASK,
249 		.shift      = ARM_4K_TT_L0_SHIFT,
250 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
251 		.valid_mask = ARM_TTE_VALID,
252 		.type_mask  = ARM_TTE_TYPE_MASK,
253 		.type_block = ARM_TTE_TYPE_BLOCK
254 	},
255 	[1] = { /* Concatenated, so index mask is larger than normal */
256 		.size       = ARM_4K_TT_L1_SIZE,
257 		.offmask    = ARM_4K_TT_L1_OFFMASK,
258 		.shift      = ARM_4K_TT_L1_SHIFT,
259 #ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
260 		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
261 #else
262 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
263 #endif
264 		.valid_mask = ARM_TTE_VALID,
265 		.type_mask  = ARM_TTE_TYPE_MASK,
266 		.type_block = ARM_TTE_TYPE_BLOCK
267 	},
268 	[2] = {
269 		.size       = ARM_4K_TT_L2_SIZE,
270 		.offmask    = ARM_4K_TT_L2_OFFMASK,
271 		.shift      = ARM_4K_TT_L2_SHIFT,
272 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
273 		.valid_mask = ARM_TTE_VALID,
274 		.type_mask  = ARM_TTE_TYPE_MASK,
275 		.type_block = ARM_TTE_TYPE_BLOCK
276 	},
277 	[3] = {
278 		.size       = ARM_4K_TT_L3_SIZE,
279 		.offmask    = ARM_4K_TT_L3_OFFMASK,
280 		.shift      = ARM_4K_TT_L3_SHIFT,
281 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
282 		.valid_mask = ARM_PTE_TYPE_VALID,
283 		.type_mask  = ARM_TTE_TYPE_MASK,
284 		.type_block = ARM_TTE_TYPE_L3BLOCK
285 	}
286 };
287 
288 const struct page_table_attr pmap_pt_attr_4k = {
289 	.pta_level_info = pmap_table_level_info_4k,
290 	.pta_root_level = (T0SZ_BOOT - 16) / 9,
291 #if __ARM_MIXED_PAGE_SIZE__
292 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
293 #else /* __ARM_MIXED_PAGE_SIZE__ */
294 #if __ARM_16K_PG__
295 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
296 #else /* __ARM_16K_PG__ */
297 	.pta_commpage_level = PMAP_TT_L1_LEVEL,
298 #endif /* __ARM_16K_PG__ */
299 #endif /* __ARM_MIXED_PAGE_SIZE__ */
300 	.pta_max_level  = PMAP_TT_L3_LEVEL,
301 	.pta_ops = &native_pt_ops,
302 	.ap_ro = ARM_PTE_AP(AP_RORO),
303 	.ap_rw = ARM_PTE_AP(AP_RWRW),
304 	.ap_rona = ARM_PTE_AP(AP_RONA),
305 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
306 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
307 	.ap_x = ARM_PTE_PNX,
308 #if __ARM_MIXED_PAGE_SIZE__
309 	.pta_tcr_value  = TCR_EL1_4KB,
310 #endif /* __ARM_MIXED_PAGE_SIZE__ */
311 	.pta_page_size  = 4096,
312 	.pta_page_shift = 12,
313 	.geometry_id = SPTM_PT_GEOMETRY_4K,
314 	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_4KB),
315 };
316 
317 const struct page_table_attr pmap_pt_attr_16k_kern = {
318 	.pta_level_info = pmap_table_level_info_16k,
319 	.pta_root_level = PMAP_TT_L1_LEVEL,
320 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
321 	.pta_max_level  = PMAP_TT_L3_LEVEL,
322 	.pta_ops = &native_pt_ops,
323 	.ap_ro = ARM_PTE_AP(AP_RORO),
324 	.ap_rw = ARM_PTE_AP(AP_RWRW),
325 	.ap_rona = ARM_PTE_AP(AP_RONA),
326 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
327 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
328 	.ap_x = ARM_PTE_PNX,
329 #if __ARM_MIXED_PAGE_SIZE__
330 	.pta_tcr_value  = TCR_EL1_16KB,
331 #endif /* __ARM_MIXED_PAGE_SIZE__ */
332 	.pta_page_size  = 16384,
333 	.pta_page_shift = 14,
334 	.geometry_id = SPTM_PT_GEOMETRY_16K_KERN,
335 	.pta_va_valid_mask = ARM_PTE_T1_REGION_MASK(TCR_EL1_16KB),
336 };
337 
338 const struct page_table_attr pmap_pt_attr_16k = {
339 	.pta_level_info = pmap_table_level_info_16k,
340 	.pta_root_level = PMAP_TT_L1_LEVEL,
341 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
342 	.pta_max_level  = PMAP_TT_L3_LEVEL,
343 	.pta_ops = &native_pt_ops,
344 	.ap_ro = ARM_PTE_AP(AP_RORO),
345 	.ap_rw = ARM_PTE_AP(AP_RWRW),
346 	.ap_rona = ARM_PTE_AP(AP_RONA),
347 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
348 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
349 	.ap_x = ARM_PTE_PNX,
350 #if __ARM_MIXED_PAGE_SIZE__
351 	.pta_tcr_value  = TCR_EL1_16KB,
352 #endif /* __ARM_MIXED_PAGE_SIZE__ */
353 	.pta_page_size  = 16384,
354 	.pta_page_shift = 14,
355 	.geometry_id = SPTM_PT_GEOMETRY_16K,
356 	.pta_va_valid_mask = ARM_PTE_T0_REGION_MASK(TCR_EL1_16KB),
357 };
358 
359 #if __ARM_16K_PG__
360 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
361 #else /* !__ARM_16K_PG__ */
362 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
363 #endif /* !__ARM_16K_PG__ */
364 
365 
366 #if DEVELOPMENT || DEBUG
367 int vm_footprint_suspend_allowed = 1;
368 
369 extern int pmap_ledgers_panic;
370 extern int pmap_ledgers_panic_leeway;
371 
372 #endif /* DEVELOPMENT || DEBUG */
373 
374 #if DEVELOPMENT || DEBUG
375 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
376 	(current_thread()->pmap_footprint_suspended)
377 #else /* DEVELOPMENT || DEBUG */
378 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
379 #endif /* DEVELOPMENT || DEBUG */
380 
381 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
382 
383 
384 /* Keeps track of whether the pmap has been bootstrapped */
385 SECURITY_READ_ONLY_LATE(bool) pmap_bootstrapped = false;
386 
387 /*
388  * Represents a tlb range that will be flushed before returning from the pmap.
389  * Used by phys_attribute_clear_range to defer flushing pages in this range until
390  * the end of the operation, and to accumulate batched operations for submission
391  * to the SPTM as a performance optimization.
392  */
393 typedef struct pmap_tlb_flush_range {
394 	/* Address space in which the flush region resides */
395 	pmap_t ptfr_pmap;
396 
397 	/* Page-aligned beginning of the flush region */
398 	vm_map_address_t ptfr_start;
399 
400 	/* Page-aligned non-inclusive end of the flush region */
401 	vm_map_address_t ptfr_end;
402 
403 	/**
404 	 * Address of current PTE position in ptfr_pmap's [ptfr_start, ptfr_end) region.
405 	 * This is meant to be set up by the caller of pmap_page_protect_options_with_flush_range()
406 	 * or arm_force_fast_fault_with_flush_range(), and used by those functions to determine
407 	 * when a given mapping can be added to the SPTM's per-CPU region templates array vs.
408 	 * the more complex task of adding it to the disjoint ops array.
409 	 */
410 	pt_entry_t *current_ptep;
411 
412 	/**
413 	 * Starting VA for any not-yet-submitted per-CPU region templates.  This is meant to be
414 	 * set up by the caller of pmap_page_protect_options_with_flush_range() or
415 	 * arm_force_fast_fault_with_flush_range() and used by pmap_multipage_op_submit_region()
416 	 * when issuing the SPTM call to purge any pending region ops.
417 	 */
418 	vm_map_address_t pending_region_start;
419 
420 	/**
421 	 * Number of entries in the per-CPU SPTM region templates array which have not
422 	 * yet been submitted to the SPTM.
423 	 */
424 	unsigned int pending_region_entries;
425 
426 	/**
427 	 * Indicates whether at least one region entry was added to the per-CPU region ops
428 	 * array since the last time this field was checked.  Intended to be cleared by the
429 	 * caller.
430 	 */
431 	bool region_entry_added;
432 
433 	/**
434 	 * Marker for the current paddr "header" entry in the per-CPU SPTM disjoint ops array.
435 	 * This field is intended to be modified only by pmap_multipage_op_submit_disjoint()
436 	 * and pmap_multipage_op_add_page(), and should be treated as opaque by callers
437 	 * of those functions.
438 	 */
439 	sptm_update_disjoint_multipage_op_t *current_header;
440 
441 	/**
442 	 * Position in the per-CPU SPTM ops array of the first ordinary
443 	 * sptm_disjoint_op_t entry following [current_header].  This is the starting
444 	 * point at which mappings should be inserted for the page described by
445 	 * [current_header].
446 	 */
447 	unsigned int current_header_first_mapping_index;
448 
449 	/**
450 	 * Number of entries in the per-CPU SPTM disjoint ops array, including paddr headers,
451 	 * which have not yet been submitted to the SPTM.
452 	 */
453 	unsigned int pending_disjoint_entries;
454 
455 	/**
456 	 * This field is used by the preemption check interval logic on the
457 	 * phys_attribute_clear_range() path to determine when sufficient
458 	 * forward progress has been made to check for and (if necessary)
459 	 * handle pending preemption.
460 	 */
461 	unsigned int processed_entries;
462 
463 	/**
464 	 * Indicates whether the top-level caller needs to flush the TLB for
465 	 * the region in [ptfr_pmap] described by [ptfr_start, ptfr_end).
466 	 * This will be set if the SPTM indicates that it needed to alter
467 	 * any valid mapping within this region and SPTM_UPDATE_DEFER_TLBI
468 	 * was passed to the relevant SPTM call(s).
469 	 */
470 	bool ptfr_flush_needed;
471 } pmap_tlb_flush_range_t;
472 
473 
474 
475 /* Virtual memory region for early allocation */
476 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
477 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
478 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
479 
480 extern uint8_t bootstrap_pagetables[];
481 
482 extern unsigned int not_in_kdp;
483 
484 extern vm_offset_t first_avail;
485 
486 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
487 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
488 extern vm_offset_t     static_memory_end;
489 
490 extern const vm_map_address_t physmap_base;
491 extern const vm_map_address_t physmap_end;
492 
493 extern int maxproc, hard_maxproc;
494 
495 extern bool sdsb_io_rgns_present;
496 
497 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
498 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
499 
500 /* The number of address bits one TTBR can cover. */
501 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
502 
503 /*
504  * The bounds on our TTBRs.  These are for sanity checking that
505  * an address is accessible by a TTBR before we attempt to map it.
506  */
507 
508 /* The level of the root of a page table. */
509 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
510 
511 /* The number of entries in the root TT of a page table. */
512 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
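/*
 * Worked example (assuming a hypothetical 16K-page configuration, i.e.
 * ARM_PGSHIFT == 14 and TTE_SHIFT == 3, with T0SZ_BOOT == 17): PGTABLE_ADDR_BITS
 * is 64 - 17 = 47, so (47 - 1 - 14) / (14 - 3) == 2 and the root sits at level
 * 3 - 2 == 1, with 2 << ((47 - 1 - 14) % 11) == 2 << 10 == 2048 root TTEs.
 */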
513 
514 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
515 const pmap_t    kernel_pmap = &kernel_pmap_store;
516 
517 __static_testable SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
518 
519 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
520 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
521 
522 typedef struct tt_free_entry {
523 	struct tt_free_entry    *next;
524 } tt_free_entry_t;
525 
526 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
527 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
528 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
529 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
530 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
531 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
532 _Atomic unsigned int inuse_iommu_pages_count[SPTM_IOMMUS_N_IDS] = {0}; /* number of active pages for each IOMMU class */
533 
534 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
535 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
536 
537 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
539 
540 /* Lock group used for all pmap object locks. */
541 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
542 
543 #if DEVELOPMENT || DEBUG
544 int nx_enabled = 1;                                     /* enable no-execute protection */
545 int allow_data_exec  = 0;                               /* No apps may execute data */
546 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
547 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
548 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
549 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
550 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
551 #else /* DEVELOPMENT || DEBUG */
552 const int nx_enabled = 1;                                       /* enable no-execute protection */
553 const int allow_data_exec  = 0;                         /* No apps may execute data */
554 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
555 #endif /* DEVELOPMENT || DEBUG */
556 
557 
558 #if MACH_ASSERT
559 static void pmap_check_ledgers(pmap_t pmap);
560 #else
561 static inline void
562 pmap_check_ledgers(__unused pmap_t pmap)
563 {
564 }
565 #endif /* MACH_ASSERT */
566 
567 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
568 
569 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
570 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
571 
572 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
573 
574 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
575 
576 /* end of shared region + 512MB for various purposes */
577 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
578 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
579     "Minimum address space size outside allowable range");
580 
581 // Max offset is 15.375GB for devices with "large" memory config
582 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
583 // Max offset is 11.375GB for devices with "small" memory config
584 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
585 
586 
587 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
588     "Large device address space size outside allowable range");
589 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
590     "Small device address space size outside allowable range");
591 
592 #  ifdef XNU_TARGET_OS_OSX
593 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
594 #  else
595 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
596 #  endif
597 
598 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
599 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
600 #else
601 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
602 #endif
603 
604 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
605 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
606 SECURITY_READ_ONLY_LATE(__static_testable bitmap_t*) asid_bitmap;
607 #if !HAS_16BIT_ASID
608 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
609 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
610 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
611 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
612 #else
613 static uint16_t last_allocated_asid = 0;
614 #endif /* !HAS_16BIT_ASID */
615 
616 
617 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table;
618 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table;
619 #if __ARM_MIXED_PAGE_SIZE__
620 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table;
621 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table;
622 #endif
623 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_data_pa = 0;
624 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_text_pa = 0;
625 SECURITY_READ_ONLY_LATE(static vm_map_address_t) commpage_text_user_va = 0;
626 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_ro_data_pa = 0;
627 
628 
629 #if (DEVELOPMENT || DEBUG)
630 /* Caches whether the SPTM sysreg API has been enabled by the SPTM */
631 SECURITY_READ_ONLY_LATE(static bool) sptm_sysreg_available = false;
632 #endif /* (DEVELOPMENT || DEBUG) */
633 
634 /* PTE Define Macros */
635 
636 #ifndef SPTM_PTE_IN_FLIGHT_MARKER
637 /* SPTM TODO: Get rid of this once we export SPTM_PTE_IN_FLIGHT_MARKER from the SPTM. */
638 #define SPTM_PTE_IN_FLIGHT_MARKER 0x80U
639 #endif /* SPTM_PTE_IN_FLIGHT_MARKER */
640 
641 /**
642  * Determine whether a PTE has been marked as compressed.  This function also panics if
643  * the PTE contains bits that shouldn't be present in a compressed PTE, which is most of them.
644  *
645  * @param pte the PTE contents to check
646  * @param ptep the address of the PTE contents, for diagnostic purposes only
647  *
648  * @return true if the PTE is compressed, false otherwise
649  */
650 static inline bool
651 pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep)
652 {
653 	const bool compressed = (!pte_is_valid(pte) && (pte & ARM_PTE_COMPRESSED));
654 	/**
655 	 * Check for bits that shouldn't be present in a compressed PTE.  This is everything except the
656 	 * compressed/compressed-alt bits, as well as the SPTM's in-flight marker which may be set while
657 	 * the SPTM is in the process of flushing the TLBs after marking a previously-valid PTE as
658 	 * compressed.
659 	 */
660 	if (__improbable(compressed && (pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER)))) {
661 		panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?",
662 		    ptep, pte, pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER));
663 	}
664 	return compressed;
665 }
666 
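/*
 * Software-defined PTE bit helpers: ARM_PTE_WIRED_MASK marks wired mappings for
 * accounting, and ARM_PTE_WRITEABLE is a software bit recording that a mapping
 * was originally writeable before its hardware access permissions were
 * restricted, so the fast-fault path can later restore write access.
 */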
667 #define pte_is_wired(pte)                                                               \
668 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
669 
670 #define pte_was_writeable(pte) \
671 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
672 
673 #define pte_set_was_writeable(pte, was_writeable) \
674 	do {                                         \
675 	        if ((was_writeable)) {               \
676 	                (pte) |= ARM_PTE_WRITEABLE;  \
677 	        } else {                             \
678 	                (pte) &= ~ARM_PTE_WRITEABLE; \
679 	        }                                    \
680 	} while(0)
681 
682 /**
683  * Update wired-mapping accounting in the PTD and ledger.
684  *
685  * @param pmap The pmap against which to update accounting
686  * @param pte_p The PTE whose wired state is being changed
687  * @param wired Indicates whether the PTE is being wired or unwired.
688  */
689 static inline void
690 pte_update_wiredcnt(pmap_t pmap, pt_entry_t *pte_p, boolean_t wired)
691 {
692 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
693 	unsigned short *ptd_wiredcnt_ptr = &(ptep_get_info(pte_p)->wiredcnt);
694 	if (wired) {
695 		if (__improbable(os_atomic_inc_orig(ptd_wiredcnt_ptr, relaxed) == UINT16_MAX)) {
696 			panic("pmap %p (pte %p): wired count overflow", pmap, pte_p);
697 		}
698 		pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
699 	} else {
700 		if (__improbable(os_atomic_dec_orig(ptd_wiredcnt_ptr, relaxed) == 0)) {
701 			panic("pmap %p (pte %p): wired count underflow", pmap, pte_p);
702 		}
703 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
704 	}
705 }
706 
707 /*
708  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
709  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
710  * will observe the updated PTE.
711  */
712 #define FLUSH_PTE()                                                                     \
713 	__builtin_arm_dmb(DMB_ISH);
714 
715 /*
716  * Synchronize updates to PTEs that were previously valid and thus may be cached in
717  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
718  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
719  * program order will not issue until the DSB completes.  Prior loads may be reordered
720  * after the barrier, but their behavior should not be materially affected by the
721  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
722  * matter for loads until the access is re-driven well after the TLB update is
723  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
724  * we should be in a position to handle access faults.  For "voluntary" PTE access
725  * restriction due to unmapping or protection, the decision to restrict access should
726  * have a data dependency on prior loads in order to avoid a data race.
727  */
728 #define FLUSH_PTE_STRONG()                                                             \
729 	__builtin_arm_dsb(DSB_ISHST);
730 
731 /**
732  * Write enough page table entries to map a single VM page. On systems where the
733  * VM page size does not match the hardware page size, multiple page table
734  * entries will need to be written.
735  *
736  * @note This function does not emit a barrier to ensure these page table writes
737  *       have completed before continuing. Such a barrier is commonly needed; in
738  *       cases where a DMB or DSB barrier is required, use the write_pte() and
739  *       write_pte_strong() functions respectively instead of this one.
740  *
741  * @param ptep Pointer to the first page table entry to update.
742  * @param pte The value to write into each page table entry. In the case that
743  *            multiple PTEs are updated to a non-empty value, then the address
744  *            in this value will automatically be incremented for each PTE
745  *            write.
746  */
747 static void
748 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
749 {
750 	/**
751 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
752 	 * systems, which is why it's checked at runtime instead of compile time.
753 	 * The "unreachable" warning needs to be suppressed because it still is a
754 	 * compile time constant on some systems.
755 	 */
756 	__unreachable_ok_push
757 	if (TEST_PAGE_RATIO_4) {
758 		if (((uintptr_t)ptep) & 0x1f) {
759 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
760 			    __func__, ptep, (void*)pte);
761 		}
762 
763 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
764 			/**
765 			 * If we're writing an empty/compressed PTE value, then don't
766 			 * auto-increment the address for each PTE write.
767 			 */
768 			*ptep = pte;
769 			*(ptep + 1) = pte;
770 			*(ptep + 2) = pte;
771 			*(ptep + 3) = pte;
772 		} else {
773 			*ptep = pte;
774 			*(ptep + 1) = pte | 0x1000;
775 			*(ptep + 2) = pte | 0x2000;
776 			*(ptep + 3) = pte | 0x3000;
777 		}
778 	} else {
779 		*ptep = pte;
780 	}
781 	__unreachable_ok_pop
782 }
783 
784 /**
785  * Writes enough page table entries to map a single VM page and then ensures
786  * those writes complete by executing a Data Memory Barrier.
787  *
788  * @note The DMB issued by this function is not strong enough to protect against
789  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
790  *       instruction is going to immediately be called after this write, it's
791  *       recommended to call write_pte_strong() instead of this function.
792  *
793  * See the function header for write_pte_fast() for more details on the
794  * parameters.
795  */
796 void
797 write_pte(pt_entry_t *ptep, pt_entry_t pte)
798 {
799 	write_pte_fast(ptep, pte);
800 	FLUSH_PTE();
801 }
802 
803 /**
804  * Retrieve the pmap structure for the thread running on the current CPU.
805  */
806 pmap_t
807 current_pmap()
808 {
809 	const pmap_t current = vm_map_pmap(current_thread()->map);
810 	assert(current != NULL);
811 	return current;
812 }
813 
814 #if DEVELOPMENT || DEBUG
815 
816 /*
817  * Trace levels are controlled by a bitmask in which each
818  * level can be enabled/disabled by the (1<<level) position
819  * in the boot arg
820  * Level 0: PPL extension functionality
821  * Level 1: pmap lifecycle (create/destroy/switch)
822  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
823  * Level 3: internal state management (attributes/fast-fault)
824  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
825  */
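/* Example: a pmap trace mask of 0x6 ((1 << 1) | (1 << 2)) enables the pmap and mapping lifecycle traces. */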
826 
827 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
828 
829 #define PMAP_TRACE(level, ...) \
830 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
831 	        KDBG_RELEASE(__VA_ARGS__); \
832 	}
833 #else /* DEVELOPMENT || DEBUG */
834 
835 #define PMAP_TRACE(level, ...)
836 
837 #endif /* DEVELOPMENT || DEBUG */
838 
839 
840 /*
841  * Internal function prototypes (forward declarations).
842  */
843 
844 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
845 
846 static void pmap_set_reference(ppnum_t pn);
847 
848 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
849 
850 static kern_return_t pmap_expand(
851 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
852 
853 static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t);
854 
855 static tt_entry_t *pmap_tt1_allocate(pmap_t, vm_size_t, uint8_t);
856 
857 static void pmap_tt1_deallocate(pmap_t, tt_entry_t *, vm_size_t);
858 
859 static kern_return_t pmap_tt_allocate(
860 	pmap_t, tt_entry_t **, pt_desc_t **, unsigned int, unsigned int);
861 
862 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
863 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
864 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
865 
866 static void pmap_unmap_commpage(
867 	pmap_t pmap);
868 
869 static boolean_t
870 pmap_is_64bit(pmap_t);
871 
872 
873 static void pmap_flush_tlb_for_paddr_async(pmap_paddr_t);
874 
875 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
876 
877 static boolean_t arm_clear_fast_fault(
878 	ppnum_t ppnum,
879 	vm_prot_t fault_type,
880 	uintptr_t pvh,
881 	pt_entry_t *pte_p,
882 	pp_attr_t attrs_to_clear);
883 
884 static void pmap_tte_deallocate(
885 	pmap_t pmap,
886 	vm_offset_t va_start,
887 	tt_entry_t *ttep,
888 	unsigned int level,
889 	bool pmap_locked);
890 
891 
892 /*
893  * Temporary prototypes, while we wait for pmap_enter to move to taking an
894  * address instead of a page number.
895  */
896 kern_return_t
897 pmap_enter(
898 	pmap_t pmap,
899 	vm_map_address_t v,
900 	ppnum_t pn,
901 	vm_prot_t prot,
902 	vm_prot_t fault_type,
903 	unsigned int flags,
904 	boolean_t wired,
905 	pmap_mapping_type_t mapping_type);
906 
907 static kern_return_t
908 pmap_enter_addr(
909 	pmap_t pmap,
910 	vm_map_address_t v,
911 	pmap_paddr_t pa,
912 	vm_prot_t prot,
913 	vm_prot_t fault_type,
914 	unsigned int flags,
915 	boolean_t wired,
916 	pmap_mapping_type_t mapping_type);
917 
918 kern_return_t
919 pmap_enter_options_addr(
920 	pmap_t pmap,
921 	vm_map_address_t v,
922 	pmap_paddr_t pa,
923 	vm_prot_t prot,
924 	vm_prot_t fault_type,
925 	unsigned int flags,
926 	boolean_t wired,
927 	unsigned int options,
928 	__unused void   *arg,
929 	pmap_mapping_type_t mapping_type);
930 
931 #ifdef CONFIG_XNUPOST
932 kern_return_t pmap_test(void);
933 #endif /* CONFIG_XNUPOST */
934 
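/*
 * PMAP_SUPPORT_PROTOTYPES() declares the *_internal implementation of each pmap
 * interface together with its dispatch index (a structure presumably carried
 * over from the PPL-based pmap, where these were monitor entry points).
 */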
935 PMAP_SUPPORT_PROTOTYPES(
936 	kern_return_t,
937 	arm_fast_fault, (pmap_t pmap,
938 	vm_map_address_t va,
939 	vm_prot_t fault_type,
940 	bool was_af_fault,
941 	bool from_user), ARM_FAST_FAULT_INDEX);
942 
943 PMAP_SUPPORT_PROTOTYPES(
944 	boolean_t,
945 	arm_force_fast_fault, (ppnum_t ppnum,
946 	vm_prot_t allow_mode,
947 	int options), ARM_FORCE_FAST_FAULT_INDEX);
948 
949 MARK_AS_PMAP_TEXT static boolean_t
950 arm_force_fast_fault_with_flush_range(
951 	ppnum_t ppnum,
952 	vm_prot_t allow_mode,
953 	int options,
954 	locked_pvh_t *locked_pvh,
955 	pp_attr_t bits_to_clear,
956 	pmap_tlb_flush_range_t *flush_range);
957 
958 PMAP_SUPPORT_PROTOTYPES(
959 	void,
960 	pmap_batch_set_cache_attributes, (
961 		const unified_page_list_t * page_list,
962 		unsigned int cacheattr,
963 		bool update_attr_table), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
964 
965 PMAP_SUPPORT_PROTOTYPES(
966 	void,
967 	pmap_change_wiring, (pmap_t pmap,
968 	vm_map_address_t v,
969 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
970 
971 PMAP_SUPPORT_PROTOTYPES(
972 	pmap_t,
973 	pmap_create_options, (ledger_t ledger,
974 	vm_map_size_t size,
975 	unsigned int flags,
976 	kern_return_t * kr), PMAP_CREATE_INDEX);
977 
978 PMAP_SUPPORT_PROTOTYPES(
979 	void,
980 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
981 
982 PMAP_SUPPORT_PROTOTYPES(
983 	kern_return_t,
984 	pmap_enter_options, (pmap_t pmap,
985 	vm_map_address_t v,
986 	pmap_paddr_t pa,
987 	vm_prot_t prot,
988 	vm_prot_t fault_type,
989 	unsigned int flags,
990 	boolean_t wired,
991 	unsigned int options,
992 	pmap_mapping_type_t mapping_type), PMAP_ENTER_OPTIONS_INDEX);
993 
994 PMAP_SUPPORT_PROTOTYPES(
995 	pmap_paddr_t,
996 	pmap_find_pa, (pmap_t pmap,
997 	addr64_t va), PMAP_FIND_PA_INDEX);
998 
999 PMAP_SUPPORT_PROTOTYPES(
1000 	kern_return_t,
1001 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
1002 
1003 
1004 PMAP_SUPPORT_PROTOTYPES(
1005 	boolean_t,
1006 	pmap_is_empty, (pmap_t pmap,
1007 	vm_map_offset_t va_start,
1008 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
1009 
1010 
1011 PMAP_SUPPORT_PROTOTYPES(
1012 	unsigned int,
1013 	pmap_map_cpu_windows_copy, (ppnum_t pn,
1014 	vm_prot_t prot,
1015 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
1016 
1017 PMAP_SUPPORT_PROTOTYPES(
1018 	void,
1019 	pmap_ro_zone_memcpy, (zone_id_t zid,
1020 	vm_offset_t va,
1021 	vm_offset_t offset,
1022 	const vm_offset_t new_data,
1023 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
1024 
1025 PMAP_SUPPORT_PROTOTYPES(
1026 	uint64_t,
1027 	pmap_ro_zone_atomic_op, (zone_id_t zid,
1028 	vm_offset_t va,
1029 	vm_offset_t offset,
1030 	zro_atomic_op_t op,
1031 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
1032 
1033 PMAP_SUPPORT_PROTOTYPES(
1034 	void,
1035 	pmap_ro_zone_bzero, (zone_id_t zid,
1036 	vm_offset_t va,
1037 	vm_offset_t offset,
1038 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1039 
1040 PMAP_SUPPORT_PROTOTYPES(
1041 	kern_return_t,
1042 	pmap_nest, (pmap_t grand,
1043 	pmap_t subord,
1044 	addr64_t vstart,
1045 	uint64_t size), PMAP_NEST_INDEX);
1046 
1047 PMAP_SUPPORT_PROTOTYPES(
1048 	void,
1049 	pmap_page_protect_options, (ppnum_t ppnum,
1050 	vm_prot_t prot,
1051 	unsigned int options,
1052 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1053 
1054 PMAP_SUPPORT_PROTOTYPES(
1055 	vm_map_address_t,
1056 	pmap_protect_options, (pmap_t pmap,
1057 	vm_map_address_t start,
1058 	vm_map_address_t end,
1059 	vm_prot_t prot,
1060 	unsigned int options,
1061 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1062 
1063 PMAP_SUPPORT_PROTOTYPES(
1064 	kern_return_t,
1065 	pmap_query_page_info, (pmap_t pmap,
1066 	vm_map_offset_t va,
1067 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1068 
1069 PMAP_SUPPORT_PROTOTYPES(
1070 	mach_vm_size_t,
1071 	pmap_query_resident, (pmap_t pmap,
1072 	vm_map_address_t start,
1073 	vm_map_address_t end,
1074 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1075 
1076 PMAP_SUPPORT_PROTOTYPES(
1077 	void,
1078 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1079 
1080 PMAP_SUPPORT_PROTOTYPES(
1081 	vm_map_address_t,
1082 	pmap_remove_options, (pmap_t pmap,
1083 	vm_map_address_t start,
1084 	vm_map_address_t end,
1085 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1086 
1087 
1088 PMAP_SUPPORT_PROTOTYPES(
1089 	void,
1090 	pmap_set_cache_attributes, (ppnum_t pn,
1091 	unsigned int cacheattr,
1092 	bool update_attr_table), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1093 
1094 PMAP_SUPPORT_PROTOTYPES(
1095 	void,
1096 	pmap_update_compressor_page, (ppnum_t pn,
1097 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1098 
1099 PMAP_SUPPORT_PROTOTYPES(
1100 	void,
1101 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1102 
1103 #if MACH_ASSERT
1104 PMAP_SUPPORT_PROTOTYPES(
1105 	void,
1106 	pmap_set_process, (pmap_t pmap,
1107 	int pid,
1108 	char *procname), PMAP_SET_PROCESS_INDEX);
1109 #endif
1110 
1111 PMAP_SUPPORT_PROTOTYPES(
1112 	void,
1113 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1114 
1115 PMAP_SUPPORT_PROTOTYPES(
1116 	void,
1117 	pmap_unnest_options, (pmap_t grand,
1118 	addr64_t vaddr,
1119 	uint64_t size,
1120 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1121 
1122 PMAP_SUPPORT_PROTOTYPES(
1123 	void,
1124 	phys_attribute_set, (ppnum_t pn,
1125 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1126 
1127 PMAP_SUPPORT_PROTOTYPES(
1128 	void,
1129 	phys_attribute_clear, (ppnum_t pn,
1130 	unsigned int bits,
1131 	int options,
1132 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1133 
1134 #if __ARM_RANGE_TLBI__
1135 PMAP_SUPPORT_PROTOTYPES(
1136 	vm_map_address_t,
1137 	phys_attribute_clear_range, (pmap_t pmap,
1138 	vm_map_address_t start,
1139 	vm_map_address_t end,
1140 	unsigned int bits,
1141 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1142 #endif /* __ARM_RANGE_TLBI__ */
1143 
1144 
1145 PMAP_SUPPORT_PROTOTYPES(
1146 	void,
1147 	pmap_switch, (pmap_t pmap, thread_t thread), PMAP_SWITCH_INDEX);
1148 
1149 PMAP_SUPPORT_PROTOTYPES(
1150 	void,
1151 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1152 
1153 PMAP_SUPPORT_PROTOTYPES(
1154 	void,
1155 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1156 
1157 PMAP_SUPPORT_PROTOTYPES(
1158 	void,
1159 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1160 
1161 PMAP_SUPPORT_PROTOTYPES(
1162 	void,
1163 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1164 
1165 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1166 PMAP_SUPPORT_PROTOTYPES(
1167 	void,
1168 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1169 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1170 
1171 PMAP_SUPPORT_PROTOTYPES(
1172 	void,
1173 	pmap_trim, (pmap_t grand,
1174 	pmap_t subord,
1175 	addr64_t vstart,
1176 	uint64_t size), PMAP_TRIM_INDEX);
1177 
1178 #if HAS_APPLE_PAC
1179 PMAP_SUPPORT_PROTOTYPES(
1180 	void *,
1181 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1182 PMAP_SUPPORT_PROTOTYPES(
1183 	void *,
1184 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1185 #endif /* HAS_APPLE_PAC */
1186 
1187 
1188 void pmap_footprint_suspend(vm_map_t    map,
1189     boolean_t   suspend);
1190 PMAP_SUPPORT_PROTOTYPES(
1191 	void,
1192 	pmap_footprint_suspend, (vm_map_t map,
1193 	boolean_t suspend),
1194 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1195 
1196 
1197 
1198 
1199 
1200 /*
1201  * The low global vector page is mapped at a fixed alias.
1202  * Since the page size is 16k for H8 and newer we map the globals to a 16k
1203  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1204  * to check both addresses anyway for backward compatibility. So for now
1205  * we leave H6 and H7 where they were.
1206  */
1207 #if (ARM_PGSHIFT == 14)
1208 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1209 #else
1210 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1211 #endif
1212 
1213 static inline void
1214 PMAP_ZINFO_PALLOC(
1215 	pmap_t pmap, int bytes)
1216 {
1217 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1218 }
1219 
1220 static inline void
1221 PMAP_ZINFO_PFREE(
1222 	pmap_t pmap,
1223 	int bytes)
1224 {
1225 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1226 }
1227 
1228 void
1229 pmap_tt_ledger_credit(
1230 	pmap_t          pmap,
1231 	vm_size_t       size)
1232 {
1233 	if (pmap != kernel_pmap) {
1234 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1235 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1236 	}
1237 }
1238 
1239 void
1240 pmap_tt_ledger_debit(
1241 	pmap_t          pmap,
1242 	vm_size_t       size)
1243 {
1244 	if (pmap != kernel_pmap) {
1245 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1246 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1247 	}
1248 }
1249 
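/**
 * Mark an ASID as recently used in the pseudo-LRU state (targets without 16-bit
 * ASIDs only).  Clears the ASID's bit in its 64-entry PLRU chunk; once every ASID
 * in the chunk has been used, the chunk's generation is advanced and its bitmap
 * is reset (hardware ASID 0 in chunk 0 stays marked as used, as it is reserved
 * for the kernel).
 */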
1250 static inline void
1251 pmap_update_plru(uint16_t asid_index __unused)
1252 {
1253 #if !HAS_16BIT_ASID
1254 	if (__probable(pmap_asid_plru)) {
1255 		unsigned plru_index = asid_index >> 6;
1256 		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1257 			asid_plru_generation[plru_index] = ++asid_plru_gencount;
1258 			asid_plru_bitmap[plru_index] = ((plru_index == 0) ? ~1ULL : UINT64_MAX);
1259 		}
1260 	}
1261 #endif /* !HAS_16BIT_ASID */
1262 }
1263 
1264 static bool
1265 alloc_asid(pmap_t pmap)
1266 {
1267 	int vasid = -1;
1268 
1269 	pmap_simple_lock(&asid_lock);
1270 
1271 #if !HAS_16BIT_ASID
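	/*
	 * Pseudo-LRU allocation: pick the 64-ASID chunk with the lowest (least
	 * recently exhausted) generation count, then claim the first virtual ASID
	 * that is both free in asid_bitmap and backed by an available hardware
	 * ASID in that chunk.
	 */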
1272 	if (__probable(pmap_asid_plru)) {
1273 		unsigned plru_index = 0;
1274 		uint64_t lowest_gen = asid_plru_generation[0];
1275 		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1276 		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1277 			if (asid_plru_generation[i] < lowest_gen) {
1278 				plru_index = i;
1279 				lowest_gen = asid_plru_generation[i];
1280 				lowest_gen_bitmap = asid_plru_bitmap[i];
1281 			}
1282 		}
1283 
1284 		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += (MAX_HW_ASIDS >> 6)) {
1285 			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1286 			if (temp_plru) {
1287 				vasid = (plru_index << 6) + lsb_first(temp_plru);
1288 #if DEVELOPMENT || DEBUG
1289 				++pmap_asid_hits;
1290 #endif
1291 				break;
1292 			}
1293 		}
1294 	}
1295 #else
1296 	/**
1297 	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
1298 	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
1299 	 * However, we first try to allocate starting from the position of the most-recently allocated
1300 	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
1301 	 * lower bit positions and then re-checking those same lower positions every time we allocate
1302 	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
1303 	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
1304 	 * logic, without requiring prohibitively expensive RCTX instructions.
1305 	 */
1306 	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
1307 #endif /* !HAS_16BIT_ASID */
1308 	if (__improbable(vasid < 0)) {
1309 		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
1310 		// slightly better with the collision detection scheme used by pmap_switch_internal().
1311 		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1312 #if DEVELOPMENT || DEBUG
1313 		++pmap_asid_misses;
1314 #endif
1315 	}
1316 	if (__improbable(vasid < 0)) {
1317 		pmap_simple_unlock(&asid_lock);
1318 		return false;
1319 	}
1320 	assert((uint32_t)vasid < pmap_max_asids);
1321 	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1322 	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1323 	const uint16_t hw_asid = (uint16_t)(vasid & (MAX_HW_ASIDS - 1));
1324 #if HAS_16BIT_ASID
1325 	last_allocated_asid = hw_asid;
1326 #endif /* HAS_16BIT_ASID */
1327 	pmap_simple_unlock(&asid_lock);
1328 	assert(hw_asid != 0); // Should never alias kernel ASID
1329 	pmap->asid = (uint16_t)vasid;
1330 	pmap_update_plru(hw_asid);
1331 	return true;
1332 }
1333 
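/**
 * Release a pmap's ASID back to the allocator: return the hardware ASID to the
 * PLRU candidate bitmap (on targets without 16-bit ASIDs) and mark the virtual
 * ASID free again in asid_bitmap.  Calling this on a pmap whose ASID has already
 * been freed is a no-op, as the ASID field is atomically exchanged with 0.
 */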
1334 static void
1335 free_asid(pmap_t pmap)
1336 {
1337 	const uint16_t vasid = os_atomic_xchg(&pmap->asid, 0, relaxed);
1338 	if (__improbable(vasid == 0)) {
1339 		return;
1340 	}
1341 
1342 #if !HAS_16BIT_ASID
1343 	if (pmap_asid_plru) {
1344 		const uint16_t hw_asid = vasid & (MAX_HW_ASIDS - 1);
1345 		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1346 	}
1347 #endif /* !HAS_16BIT_ASID */
1348 	pmap_simple_lock(&asid_lock);
1349 	assert(!bitmap_test(&asid_bitmap[0], vasid));
1350 	bitmap_set(&asid_bitmap[0], vasid);
1351 	pmap_simple_unlock(&asid_lock);
1352 }
1353 
1354 
1355 boolean_t
1356 pmap_valid_address(
1357 	pmap_paddr_t addr)
1358 {
1359 	return pa_valid(addr);
1360 }
1361 
1362 
1363 
1364 
1365 
1366 
1367 /*
1368  *      Map memory at initialization.  The physical addresses being
1369  *      mapped are not managed and are never unmapped.
1370  *
1371  *      For now, VM is already on, we only need to map the
1372  *      specified memory.
1373  */
1374 vm_map_address_t
1375 pmap_map(
1376 	vm_map_address_t virt,
1377 	vm_offset_t start,
1378 	vm_offset_t end,
1379 	vm_prot_t prot,
1380 	unsigned int flags)
1381 {
1382 	kern_return_t   kr;
1383 	vm_size_t       ps;
1384 
1385 	ps = PAGE_SIZE;
1386 	while (start < end) {
1387 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1388 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1389 
1390 		if (kr != KERN_SUCCESS) {
1391 			panic("%s: failed pmap_enter, "
1392 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1393 			    __FUNCTION__,
1394 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1395 		}
1396 
1397 		virt += ps;
1398 		start += ps;
1399 	}
1400 
1401 
1402 	return virt;
1403 }
1404 
1405 #if HAS_SPTM_SYSCTL
1406 bool disarm_protected_io = false;
1407 #endif /* HAS_SPTM_SYSCTL */
1408 
1409 /**
1410  * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type.
1411  *
1412  * @param paddr The physical address of the page.
1413  * @param tmplate The PTE value to be evaluated.
1414  *
1415  * @return A new PTE value with permission bits modified.
1416  */
1417 static inline
1418 pt_entry_t
1419 pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate)
1420 {
1421 #if HAS_SPTM_SYSCTL
1422 	if (__improbable(disarm_protected_io)) {
1423 		/* Make sure disarm_protected_io is read before its counterpart in SPTM */
1424 		os_atomic_thread_fence(acquire);
1425 		return tmplate;
1426 	}
1427 
1428 #endif /* HAS_SPTM_SYSCTL */
1429 
1430 	/**
1431 	 * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade
1432 	 * the mapping to RO. This is required because IOKit relies on this
1433 	 * behavior currently in the PPL.
1434 	 */
1435 	const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
1436 	if (frame_type == XNU_PROTECTED_IO) {
1437 		/* Downgrade KERN_RW to KERN_RO so the kernel maps XNU_PROTECTED_IO frames read-only. */
1438 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1439 		switch (xprr_perm) {
1440 		case XPRR_KERN_RO_PERM:
1441 			break;
1442 		case XPRR_KERN_RW_PERM:
1443 			tmplate &= ~ARM_PTE_XPRR_MASK;
1444 			tmplate |= xprr_perm_to_pte(XPRR_KERN_RO_PERM);
1445 			break;
1446 		default:
1447 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1448 		}
1449 	}
1450 
1451 	return tmplate;
1452 }
1453 
1454 vm_map_address_t
1455 pmap_map_bd_with_options(
1456 	vm_map_address_t virt,
1457 	vm_offset_t start,
1458 	vm_offset_t end,
1459 	vm_prot_t prot,
1460 	int32_t options)
1461 {
1462 	pt_entry_t      tmplate;
1463 	vm_map_address_t vaddr;
1464 	vm_offset_t     paddr;
1465 	pt_entry_t      mem_attr;
1466 
1467 	if (__improbable(start & PAGE_MASK)) {
1468 		panic("%s: start 0x%lx is not page aligned", __func__, start);
1469 	}
1470 
1471 	if (__improbable(end & PAGE_MASK)) {
1472 		panic("%s: end 0x%lx is not page aligned", __func__, end);
1473 	}
1474 
1475 	if (__improbable(!gDramBase || !gDramSize)) {
1476 		panic("%s: gDramBase/gDramSize not initialized", __func__);
1477 	}
1478 
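	/*
	 * Reject ranges that straddle the DRAM boundary: the memory attributes
	 * chosen below (e.g. for the write-combined case) depend on whether the
	 * target address is DRAM.
	 */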
1479 	bool first_page_is_dram = is_dram_addr(start);
1480 	for (vm_offset_t pa = start + PAGE_SIZE; pa < end; pa += PAGE_SIZE) {
1481 		if (first_page_is_dram != is_dram_addr(pa)) {
1482 			panic("%s: range crosses DRAM boundary. First inconsistent page 0x%lx %s DRAM",
1483 			    __func__, pa, first_page_is_dram ? "is not" : "is");
1484 		}
1485 	}
1486 
1487 	switch (options & PMAP_MAP_BD_MASK) {
1488 	case PMAP_MAP_BD_WCOMB:
1489 		if (is_dram_addr(start)) {
1490 			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1491 		} else {
1492 #if HAS_FEAT_XS
1493 			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
1494 #else /* HAS_FEAT_XS */
1495 			mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1496 #endif /* HAS_FEAT_XS */
1497 #if DEBUG || DEVELOPMENT
1498 			pmap_wcrt_on_non_dram_count_increment_atomic();
1499 #endif /* DEBUG || DEVELOPMENT */
1500 		}
1501 		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1502 		break;
1503 	case PMAP_MAP_BD_POSTED:
1504 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1505 		break;
1506 	case PMAP_MAP_BD_POSTED_REORDERED:
1507 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1508 		break;
1509 	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1510 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1511 		break;
1512 	default:
1513 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1514 		break;
1515 	}
1516 
1517 	tmplate = ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1518 	    mem_attr | ARM_PTE_TYPE_VALID | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1519 
1520 #if __ARM_KERNEL_PROTECT__
1521 	tmplate |= ARM_PTE_NG;
1522 #endif /* __ARM_KERNEL_PROTECT__ */
1523 
1524 	vaddr = virt;
1525 	paddr = start;
1526 	while (paddr < end) {
1527 		__assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, vaddr, pmap_force_pte_kernel_ro_if_protected_io(paddr, tmplate) | pa_to_pte(paddr));
1528 		assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1529 
1530 		vaddr += PAGE_SIZE;
1531 		paddr += PAGE_SIZE;
1532 	}
1533 
1534 	return vaddr;
1535 }
1536 
1537 /*
1538  *      Back-door routine for mapping kernel VM at initialization.
1539  *      Useful for mapping memory outside the range
1540  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1541  *      Otherwise like pmap_map.
1542  */
1543 vm_map_address_t
1544 pmap_map_bd(
1545 	vm_map_address_t virt,
1546 	vm_offset_t start,
1547 	vm_offset_t end,
1548 	vm_prot_t prot)
1549 {
1550 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
1551 }
1552 
1553 /*
1554  *      Back-door routine for mapping kernel VM at initialization.
1555  *      Useful for mapping memory specific physical addresses in early
1556  *      boot (i.e., before kernel_map is initialized).
1557  *
1558  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1559  */
1560 
1561 vm_map_address_t
1562 pmap_map_high_window_bd(
1563 	vm_offset_t pa_start,
1564 	vm_size_t len,
1565 	vm_prot_t prot)
1566 {
1567 	pt_entry_t              *ptep, pte;
1568 	vm_map_address_t        va_start = VREGION1_START;
1569 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1570 	vm_map_address_t        va_end;
1571 	vm_map_address_t        va;
1572 	vm_size_t               offset;
1573 
1574 	offset = pa_start & PAGE_MASK;
1575 	pa_start -= offset;
1576 	len += offset;
1577 
1578 	if (len > (va_max - va_start)) {
1579 		panic("%s: area too large, "
1580 		    "pa_start=%p, len=%p, prot=0x%x",
1581 		    __FUNCTION__,
1582 		    (void*)pa_start, (void*)len, prot);
1583 	}
1584 
1585 scan:
1586 	for (; va_start < va_max; va_start += PAGE_SIZE) {
1587 		ptep = pmap_pte(kernel_pmap, va_start);
1588 		assert(!pte_is_compressed(*ptep, ptep));
1589 		if (!pte_is_valid(*ptep)) {
1590 			break;
1591 		}
1592 	}
1593 	if (va_start > va_max) {
1594 		panic("%s: insufficient pages, "
1595 		    "pa_start=%p, len=%p, prot=0x%x",
1596 		    __FUNCTION__,
1597 		    (void*)pa_start, (void*)len, prot);
1598 	}
1599 
1600 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1601 		ptep = pmap_pte(kernel_pmap, va_end);
1602 		assert(!pte_is_compressed(*ptep, ptep));
1603 		if (pte_is_valid(*ptep)) {
1604 			va_start = va_end + PAGE_SIZE;
1605 			goto scan;
1606 		}
1607 	}
1608 
1609 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1610 		ptep = pmap_pte(kernel_pmap, va);
1611 		pte = pa_to_pte(pa_start)
1612 		    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1613 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1614 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT)
1615 		    | ARM_PTE_SH(SH_OUTER_MEMORY);
1616 #if __ARM_KERNEL_PROTECT__
1617 		pte |= ARM_PTE_NG;
1618 #endif /* __ARM_KERNEL_PROTECT__ */
1619 		__assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, va, pte);
1620 		assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1621 	}
1622 #if KASAN
1623 	kasan_notify_address(va_start, len);
1624 #endif
1625 	return va_start;
1626 }
1627 
1628 /*
1629  * pmap_get_arm64_prot
1630  *
1631  * return effective armv8 VMSA block protections including
1632  * table AP/PXN/XN overrides of a pmap entry
1633  *
1634  */
1635 
1636 uint64_t
1637 pmap_get_arm64_prot(
1638 	pmap_t pmap,
1639 	vm_offset_t addr)
1640 {
1641 	tt_entry_t tte = 0;
1642 	unsigned int level = 0;
1643 	uint64_t effective_prot_bits = 0;
1644 	uint64_t aggregate_tte = 0;
1645 	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1646 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1647 
1648 	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1649 		tte = *pmap_ttne(pmap, level, addr);
1650 
1651 		if (!(tte & ARM_TTE_VALID)) {
1652 			return 0;
1653 		}
1654 
1655 		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
1656 			/* Block or page mapping; both have the same protection bit layout. */
1657 			break;
1658 		} else if (tte_is_table(tte)) {
1659 			/* All of the table bits we care about are overrides, so just OR them together. */
1660 			aggregate_tte |= tte;
1661 		}
1662 	}
1663 
1664 	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1665 	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1666 	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1667 
1668 	/* Start with the PTE bits. */
1669 	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1670 
1671 	/* Table AP bits mask out block/page AP bits */
1672 	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1673 
1674 	/* XN/PXN bits can be OR'd in. */
1675 	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1676 	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1677 
1678 	return effective_prot_bits;
1679 }
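/*
 * Editorial sketch (not part of the original source): a diagnostic caller
 * could decode the value returned by pmap_get_arm64_prot() with the same PTE
 * masks used above; some_va is a hypothetical address.
 *
 *	uint64_t prot = pmap_get_arm64_prot(kernel_pmap, some_va);
 *	bool xn  = (prot & ARM_PTE_NX)  != 0;   // (unprivileged) execute-never
 *	bool pxn = (prot & ARM_PTE_PNX) != 0;   // privileged execute-never
 *	uint64_t ap = prot & ARM_PTE_APMASK;    // effective access permissions
 */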
1680 
1681 /**
1682  * Helper macros for accessing the "unnested" and "in-progress" bits in
1683  * pmap->nested_region_unnested_table_bitmap.
1684  */
1685 #define UNNEST_BIT(index) ((index) * 2)
1686 #define UNNEST_IN_PROGRESS_BIT(index) (((index) * 2) + 1)
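/*
 * For illustration: each nested-region index owns two adjacent bits in the
 * bitmap, so for index 3:
 *
 *	UNNEST_BIT(3)             == 6   // "unnested" state bit
 *	UNNEST_IN_PROGRESS_BIT(3) == 7   // "unnest in progress" bit
 */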
1687 
1688 /*
1689  *	Bootstrap the system enough to run with virtual memory.
1690  *
1691  *	The early VM initialization code has already allocated
1692  *	the first CPU's translation table and made entries for
1693  *	all the one-to-one mappings to be found there.
1694  *
1695  *	We must set up the kernel pmap structures, the
1696  *	physical-to-virtual translation lookup tables for the
1697  *	physical memory to be managed (between avail_start and
1698  *	avail_end).
1699  *
1700  *	Map the kernel's code and data, and allocate the system page table.
1701  *	Page_size must already be set.
1702  *
1703  *	Parameters:
1704  *	first_avail	first available physical page -
1705  *			   after kernel page tables
1706  *	avail_start	PA of first managed physical page
1707  *	avail_end	PA of last managed physical page
1708  */
1709 
1710 void
1711 pmap_bootstrap(
1712 	vm_offset_t vstart)
1713 {
1714 	vm_map_offset_t maxoffset;
1715 
1716 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
1717 
1718 #if DEVELOPMENT || DEBUG
1719 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
1720 		kprintf("Kernel traces for pmap operations enabled\n");
1721 	}
1722 #endif
1723 
1724 	/*
1725 	 *	Initialize the kernel pmap.
1726 	 */
1727 #if ARM_PARAMETERIZED_PMAP
1728 	kernel_pmap->pmap_pt_attr = &pmap_pt_attr_16k_kern;
1729 #endif /* ARM_PARAMETERIZED_PMAP */
1730 #if HAS_APPLE_PAC
1731 	kernel_pmap->disable_jop = 0;
1732 #endif /* HAS_APPLE_PAC */
1733 	kernel_pmap->tte = cpu_tte;
1734 	kernel_pmap->ttep = cpu_ttep;
1735 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
1736 	kernel_pmap->max = UINTPTR_MAX;
1737 	os_ref_init_count_raw(&kernel_pmap->ref_count, &pmap_refgrp, 1);
1738 	kernel_pmap->nx_enabled = TRUE;
1739 	kernel_pmap->is_64bit = TRUE;
1740 #if CONFIG_ROSETTA
1741 	kernel_pmap->is_rosetta = FALSE;
1742 #endif
1743 
1744 	kernel_pmap->nested_region_addr = 0x0ULL;
1745 	kernel_pmap->nested_region_size = 0x0ULL;
1746 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
1747 	kernel_pmap->type = PMAP_TYPE_KERNEL;
1748 
1749 	kernel_pmap->asid = 0;
1750 
1751 	/**
1752 	 * The kernel pmap lock is no longer needed; init it and then destroy it to
1753 	 * place it in a known-invalid state that will cause any attempt to use it
1754 	 * to fail.
1755 	 */
1756 	pmap_lock_init(kernel_pmap);
1757 	pmap_lock_destroy(kernel_pmap);
1758 
1759 	pmap_max_asids = SPTMArgs->num_asids;
1760 
1761 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
1762 
1763 	/**
1764 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
1765 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
1766 	 * space for these data structures.
1767 	 * */
1768 	pmap_data_bootstrap();
1769 
1770 	/**
1771 	 * Don't make any assumptions about the alignment of avail_start before this
1772 	 * point (i.e., pmap_data_bootstrap() performs allocations).
1773 	 */
1774 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
1775 
1776 	const pmap_paddr_t pmap_struct_start = avail_start;
1777 
1778 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
1779 	avail_start = round_page(avail_start + asid_table_size);
1780 
1781 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
1782 
1783 	queue_init(&map_pmap_list);
1784 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
1785 
1786 	virtual_space_start = vstart;
1787 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
1788 
1789 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
1790 	/* Clear the ASIDs which will alias the reserved kernel ASID of 0. */
1791 	for (unsigned int i = 0; i < pmap_max_asids; i += MAX_HW_ASIDS) {
1792 		bitmap_clear(&asid_bitmap[0], i);
1793 	}
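	/*
	 * For example (hypothetical sizes): with MAX_HW_ASIDS == 256 and
	 * pmap_max_asids == 1024, the loop above clears vASIDs 0, 256, 512 and
	 * 768, since each of those would alias the reserved kernel ASID of 0.
	 */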
1794 
1795 
1796 #if !HAS_16BIT_ASID
1797 	/**
1798 	 * Align the range of available hardware ASIDs to a multiple of 64 to enable the
1799 	 * masking used by the PLRU scheme.  This means we must handle the case in which
1800 	 * the returned hardware ASID is 0, which we do by clearing all vASIDs that will
1801 	 * alias the kernel ASID.
1802 	 */
1803 	pmap_max_asids = pmap_max_asids & ~63ul;
1804 	if (__improbable(pmap_max_asids == 0)) {
1805 		panic("%s: insufficient number of ASIDs (%u) supplied by SPTM", __func__, (unsigned int)pmap_max_asids);
1806 	}
1807 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
1808 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
1809 	_Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
1810 	_Static_assert((MAX_HW_ASIDS % 64) == 0, "MAX_HW_ASIDS is not divisible by 64");
1811 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
1812 	bitmap_clear(&asid_plru_bitmap[0], 0);
1813 #endif /* !HAS_16BIT_ASID */
1814 
1815 
1816 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
1817 		maxoffset = trunc_page(maxoffset);
1818 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
1819 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
1820 			arm_pmap_max_offset_default = maxoffset;
1821 		}
1822 	}
1823 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
1824 		maxoffset = trunc_page(maxoffset);
1825 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
1826 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
1827 			arm64_pmap_max_offset_default = maxoffset;
1828 		}
1829 	}
1830 
1831 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
1832 
1833 
1834 #if DEVELOPMENT || DEBUG
1835 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
1836 	    &vm_footprint_suspend_allowed,
1837 	    sizeof(vm_footprint_suspend_allowed));
1838 #endif /* DEVELOPMENT || DEBUG */
1839 
1840 #if KASAN
1841 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
1842 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
1843 #endif /* KASAN */
1844 
1845 	/**
1846 	 * Ensure that avail_start is always left on a page boundary. The calling
1847 	 * code might not perform any alignment before allocating page tables so
1848 	 * this is important.
1849 	 */
1850 	avail_start = round_page(avail_start);
1851 
1852 
1853 #if (DEVELOPMENT || DEBUG)
1854 	(void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available);
1855 #endif /* (DEVELOPMENT || DEBUG) */
1856 
1857 #if __ARM64_PMAP_SUBPAGE_L1__
1858 	/* Initialize the Subpage User Root Table subsystem. */
1859 	surt_init();
1860 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
1861 
1862 	/* Signal that the pmap has been bootstrapped */
1863 	pmap_bootstrapped = true;
1864 }
1865 
1866 /**
1867  * Helper for creating a populated commpage table
1868  *
1869  * In order to avoid burning extra pages on mapping the commpage, we create a
1870  * dedicated table hierarchy for the commpage.  We forcibly nest the translation tables from
1871  * this pmap into other pmaps.  The level we will nest at depends on the MMU configuration (page
1872  * size, TTBR range, etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
1873  *
1874  * @note This is NOT "the nested pmap" (which is used to nest the shared cache).
1875  *
1876  * @param rw_va Virtual address at which to insert a mapping to the kernel R/W commpage
1877  * @param ro_va Virtual address at which to insert a mapping to the kernel R/O commpage
1878  * @param rw_pa Physical address of kernel R/W commpage
1879  * @param ro_pa Physical address of kernel R/O commpage, may be 0 if not supported in this
1880  *              configuration
1881  * @param rx_pa Physical address of user executable (and kernel R/O) commpage, may be 0 if
1882  *              not supported in this configuration
1883  * @param pmap_create_flags Control flags for the temporary pmap created by this function
1884  *
1885  * @return the physical address of the created commpage table, typed as
1886  *         XNU_PAGE_TABLE_COMMPAGE and containing all relevant commpage mappings.
1887  */
1888 static pmap_paddr_t
1889 pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va,
1890     pmap_paddr_t rw_pa, pmap_paddr_t ro_pa, pmap_paddr_t rx_pa, unsigned int pmap_create_flags)
1891 {
1892 	pmap_t temp_commpage_pmap = pmap_create_options(NULL, 0, pmap_create_flags);
1893 	assert(temp_commpage_pmap != NULL);
1894 	assert(rw_pa != 0);
1895 	const pt_attr_t *pt_attr = pmap_get_pt_attr(temp_commpage_pmap);
1896 
1897 	/*
1898 	 * We only use pmap_expand to expand the pmap up to the commpage nesting level.  At that level
1899 	 * and beyond, all the newly created tables will be nested directly into the userspace region
1900 	 * for each process, and as such they must be of the dedicated SPTM commpage table type so that
1901 	 * the SPTM can enforce the commpage security model which forbids random replacement of commpage
1902 	 * mappings.
1903 	 */
1904 	kern_return_t kr = pmap_expand(temp_commpage_pmap, rw_va, 0, pt_attr_commpage_level(pt_attr));
1905 	assert(kr == KERN_SUCCESS);
1906 
1907 	pmap_paddr_t commpage_table_pa = 0;
1908 	for (unsigned int i = pt_attr_commpage_level(pt_attr); i < pt_attr_leaf_level(pt_attr); i++) {
1909 		pmap_paddr_t new_table = 0;
1910 		kr = pmap_page_alloc(&new_table, 0);
1911 		assert((kr == KERN_SUCCESS) && (new_table != 0));
1912 		if (commpage_table_pa == 0) {
1913 			commpage_table_pa = new_table;
1914 		}
1915 
1916 		pt_desc_t *ptdp = ptd_alloc(temp_commpage_pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
1917 		assert(ptdp);
1918 
1919 		const unsigned int pai = pa_index(new_table);
1920 		locked_pvh_t locked_pvh = pvh_lock(pai);
1921 		pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
1922 
1923 		ptd_info_init(ptdp, temp_commpage_pmap, pt_attr_align_va(pt_attr, i, rw_va), i + 1, NULL);
1924 
1925 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1926 		retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
1927 		sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params);
1928 
1929 		const sptm_tte_t table_tte = (new_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1930 
1931 		sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va),
1932 		    (sptm_pt_level_t)i, table_tte);
1933 
1934 		ptd_info_finalize(ptdp);
1935 
1936 		/* The PTD's associated pmap (temp_commpage_pmap) is about to be destroyed, so set it to NULL here. */
1937 		ptdp->pmap = NULL;
1938 
1939 		pvh_unlock(&locked_pvh);
1940 	}
1941 
1942 	/*
1943 	 * Note the lack of ARM_PTE_NG here: commpage mappings are at fixed addresses and
1944 	 * frequently accessed, so we map them global to avoid unnecessary TLB pressure.
1945 	 */
1946 	static const sptm_pte_t commpage_pte_template = ARM_PTE_TYPE_VALID
1947 	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK)
1948 	    | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX
1949 	    | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF;
1950 
1951 	sptm_return_t sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, rw_va,
1952 	    commpage_pte_template | ARM_PTE_NX | pa_to_pte(rw_pa));
1953 	assert(sptm_ret == SPTM_SUCCESS);
1954 
1955 	if (ro_pa != 0) {
1956 		assert((ro_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1957 		sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, ro_va,
1958 		    commpage_pte_template | ARM_PTE_NX | pa_to_pte(ro_pa));
1959 		assert(sptm_ret == SPTM_SUCCESS);
1960 	}
1961 
1962 	if (rx_pa != 0) {
1963 		assert((commpage_text_user_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1964 		assert((commpage_text_user_va != rw_va) && (commpage_text_user_va != ro_va));
1965 		sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, commpage_text_user_va, commpage_pte_template | pa_to_pte(rx_pa));
1966 		assert(sptm_ret == SPTM_SUCCESS);
1967 	}
1968 
1969 
1970 	/* Unmap the commpage table here so that it won't be deallocated by pmap_destroy(). */
1971 	sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va),
1972 	    (sptm_pt_level_t)pt_attr_commpage_level(pt_attr));
1973 	pmap_destroy(temp_commpage_pmap);
1974 
1975 	return commpage_table_pa;
1976 }
1977 
1978 /**
1979  * Helper for creating all commpage tables applicable to the current configuration.
1980  *
1981  * @note This function is intended to be called during bootstrap.
1982  * @note This function assumes that pmap_create_commpages has already executed, and therefore
1983  *       the commpage_*_pa variables have been assigned to their final values.  commpage_data_pa
1984  *       is the kernel RW commpage and is assumed to be present on all configurations, so it
1985  *       therefore must be non-zero at this point.  The other variables are considered optional
1986  *       depending upon configuration and may be zero.
1987  */
1988 void pmap_prepare_commpages(void);
1989 void
1990 pmap_prepare_commpages(void)
1991 {
1992 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1993 	assert(commpage_data_pa != 0);
1994 	sptm_retype(commpage_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RW, retype_params);
1995 	if (commpage_ro_data_pa != 0) {
1996 		sptm_retype(commpage_ro_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RO, retype_params);
1997 	}
1998 	if (commpage_text_pa != 0) {
1999 		sptm_retype(commpage_text_pa, XNU_DEFAULT, XNU_COMMPAGE_RX, retype_params);
2000 	}
2001 
2002 	/*
2003 	 * User mapping of comm page text section for 64 bit mapping only
2004 	 *
2005 	 * We don't insert the text commpage into the 32 bit mapping because we don't want
2006 	 * 32-bit user processes to get this page mapped in; they should never call into
2007 	 * this page.
2008 	 */
2009 	commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2010 	    commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, PMAP_CREATE_64BIT);
2011 
2012 	/*
2013 	 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2014 	 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2015 	 */
2016 	commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2017 	    commpage_data_pa, commpage_ro_data_pa, 0, 0);
2018 
2019 #if __ARM_MIXED_PAGE_SIZE__
2020 	commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
2021 	    commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_64BIT | PMAP_CREATE_FORCE_4K_PAGES);
2022 
2023 	/*
2024 	 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
2025 	 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
2026 	 * commpage32_4k_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
2027 	 *    commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
2028 	 */
2029 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2030 
2031 }
2032 
2033 void
2034 pmap_virtual_space(
2035 	vm_offset_t *startp,
2036 	vm_offset_t *endp
2037 	)
2038 {
2039 	*startp = virtual_space_start;
2040 	*endp = virtual_space_end;
2041 }
2042 
2043 
2044 boolean_t
2045 pmap_virtual_region(
2046 	unsigned int region_select,
2047 	vm_map_offset_t *startp,
2048 	vm_map_size_t *size
2049 	)
2050 {
2051 	boolean_t       ret = FALSE;
2052 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)
2053 	if (region_select == 0) {
2054 		/*
2055 		 * In this config, the bootstrap mappings should occupy their own L2
2056 		 * TTs, as they should be immutable after boot.  Having the associated
2057 		 * TTEs and PTEs in their own pages allows us to lock down those pages,
2058 		 * while allowing the rest of the kernel address range to be remapped.
2059 		 */
2060 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2061 #if defined(ARM_LARGE_MEMORY)
2062 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2063 #else
2064 		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2065 #endif
2066 		ret = TRUE;
2067 	}
2068 
2069 #if defined(ARM_LARGE_MEMORY)
2070 	if (region_select == 1) {
2071 		*startp = VREGION1_START;
2072 		*size = VREGION1_SIZE;
2073 		ret = TRUE;
2074 	}
2075 #endif
2076 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) */
2077 #if defined(ARM_LARGE_MEMORY)
2078 	/* For large memory systems with no KTRR/CTRR */
2079 	if (region_select == 0) {
2080 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2081 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2082 		ret = TRUE;
2083 	}
2084 
2085 	if (region_select == 1) {
2086 		*startp = VREGION1_START;
2087 		*size = VREGION1_SIZE;
2088 		ret = TRUE;
2089 	}
2090 #else /* !defined(ARM_LARGE_MEMORY) */
2091 	unsigned long low_global_vr_mask = 0;
2092 	vm_map_size_t low_global_vr_size = 0;
2093 
2094 	if (region_select == 0) {
2095 		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2096 		if (!TEST_PAGE_SIZE_4K) {
2097 			*startp = gVirtBase & 0xFFFFFFFFFE000000;
2098 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2099 		} else {
2100 			*startp = gVirtBase & 0xFFFFFFFFFF800000;
2101 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2102 		}
2103 		ret = TRUE;
2104 	}
2105 	if (region_select == 1) {
2106 		*startp = VREGION1_START;
2107 		*size = VREGION1_SIZE;
2108 		ret = TRUE;
2109 	}
2110 	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2111 	if (!TEST_PAGE_SIZE_4K) {
2112 		low_global_vr_mask = 0xFFFFFFFFFE000000;
2113 		low_global_vr_size = 0x2000000;
2114 	} else {
2115 		low_global_vr_mask = 0xFFFFFFFFFF800000;
2116 		low_global_vr_size = 0x800000;
2117 	}
2118 
2119 	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2120 		*startp = LOW_GLOBAL_BASE_ADDRESS;
2121 		*size = low_global_vr_size;
2122 		ret = TRUE;
2123 	}
2124 
2125 	if (region_select == 3) {
2126 		/* In this config, we allow the bootstrap mappings to occupy the same
2127 		 * page table pages as the heap.
2128 		 */
2129 		*startp = VM_MIN_KERNEL_ADDRESS;
2130 		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2131 		ret = TRUE;
2132 	}
2133 #endif /* defined(ARM_LARGE_MEMORY) */
2134 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR) */
2135 	return ret;
2136 }
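/*
 * Editorial sketch (not part of the original source): callers typically probe
 * the regions published above by increasing region_select until the routine
 * returns FALSE, e.g.:
 *
 *	vm_map_offset_t start;
 *	vm_map_size_t size;
 *	for (unsigned int i = 0; pmap_virtual_region(i, &start, &size); i++) {
 *		// reserve [start, start + size) for kernel VM use
 *	}
 */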
2137 
2138 /*
2139  * Routines to track and allocate physical pages during early boot.
2140  * On most systems that memory runs from first_avail through to avail_end
2141  * with no gaps.
2142  *
2143  * If the system supports ECC and ecc_bad_pages_count > 0, we
2144  * need to skip those pages.
2145  */
2146 
2147 static unsigned int avail_page_count = 0;
2148 static bool need_ram_ranges_init = true;
2149 
2150 
2151 /**
2152  * Checks to see if a given page is in
2153  * the array of known bad pages
2154  *
2155  * @param ppn page number to check
2156  */
2157 bool
2158 pmap_is_bad_ram(__unused ppnum_t ppn)
2159 {
2160 	return false;
2161 }
2162 
2163 /**
2164  * Prepare bad ram pages to be skipped.
2165  */
2166 #if HAS_MTE
2167 
2168 /*
2169  * Things to track use of MTE tag pages.
2170  *
2171  * The tag storage region starts at mte_tag_storage_start, and ends at
2172  * mte_tag_storage_end.  The tag storage region should consist of
2173  * mte_tag_storage_count pages.
2174  *
2175  * mte_tag_storage_start_pnum is just the physical page number of the first
2176  * page in the tag storage region.
2177  */
2178 SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_start;
2179 SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_end;
2180 SECURITY_READ_ONLY_LATE(ppnum_t)      mte_tag_storage_start_pnum;
2181 SECURITY_READ_ONLY_LATE(uint_t)       mte_tag_storage_count;
2182 
2183 /*
2184  * Bounds for calculating which portions of the tag storage range that won't be
2185  * used for tag storage
2186  *
2187  * We currently expect DRAM to look (very roughly) like this (unless the maxmem
2188  * boot-arg is being used):
2189  *
2190  * +-----------+---------+-------------+-----------------+-----------+
2191  * | Unmanaged | Managed | Tag Storage | Managed (maybe) | Unmanaged |
2192  * +-----------+---------+-------------+-----------------+-----------+
2193  *
2194  * The system will never tag the unmanaged pages, as it will not have data
2195  * structures for those pages.  The system will also never tag the tag storage
2196  * pages, as this is forbidden by the hardware.
2197  *
2198  * The maxmem boot-arg may grow the size of the ending unmanaged region
2199  * (potentially extended into or past the tag storage region).
2200  *
2201  * As far as the terminology goes, "recursive" tag storage is the tag storage
2202  * range that covers the tag storage region.  The "managed" tag storage is the
2203  * tag storage range that covers managed memory (which includes the tag storage
2204  * range itself, unless the maxmem boot-arg is involved).  This implicitly
2205  * means that the "unmanaged" tag storage range is all tag storage outside the
2206  * "managed" range.
2207  */
2208 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_start_pnum;
2209 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_end_pnum;
2210 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_managed_start_pnum;
2211 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_managed_end_pnum;
2212 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_discarded_start_pnum;
2213 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_discarded_end_pnum;
2214 static SECURITY_READ_ONLY_LATE(pmap_paddr_t) mte_tag_storage_recursive_discarded_end_pnum;
2215 
2216 static inline void
2217 pmap_tag_op(const unified_page_list_t *page_list, bool tag_not_untag, __assert_only bool panic_on_redundant_calls)
2218 {
2219 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
2220 	sptm_paddr_t *paddr_list = NULL;
2221 
2222 	unsigned int num_paddrs = 0;
2223 
2224 	/**
2225 	 * Drain the epochs to ensure any lingering batched operations that may have taken
2226 	 * an in-flight reference to these pages are complete.  sptm_tag_papt_multipage(),
2227 	 * much like sptm_retype(), takes exclusive guards on each physical page, so this
2228 	 * is needed as a precaution to avoid a race with (for example) a concurrent
2229 	 * pmap_remove() which may still hold a lingering shared guard on a page in this
2230 	 * list after removing a mapping.  The VM layer should guarantee that all existing
2231 	 * mappings have been disconnected and no new mappings should be incoming for the
2232 	 * pages when this function is called.
2233 	 */
2234 	pmap_epoch_prepare_drain();
2235 	pmap_epoch_drain();
2236 
2237 	unified_page_list_iterator_t iter;
2238 
2239 	for (unified_page_list_iterator_init(page_list, &iter);
2240 	    !unified_page_list_iterator_end(&iter);
2241 	    unified_page_list_iterator_next(&iter)) {
2242 		bool is_fictitious = false;
2243 		const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
2244 		const pmap_paddr_t paddr = ptoa(pn);
2245 		vm_page_t page;
2246 
2247 		/**
2248 		 * The VM may pass a fictitious or guard page here, which doesn't have a valid
2249 		 * managed PA.
2250 		 */
2251 		if (__improbable(!pa_valid(paddr) || is_fictitious)) {
2252 			continue;
2253 		}
2254 
2255 		page = unified_page_list_iterator_vm_page(&iter);
2256 		if (page == VM_PAGE_NULL) {
2257 			/*
2258 			 * all pages that we tag or untag are managed, meaning
2259 			 * that resolution should always succeed once we're past
2260 			 * bootstrap.
2261 			 *
2262 			 * Before bootstrap it means that callers must be sure
2263 			 * there's work to do.
2264 			 */
2265 			assert(startup_phase < STARTUP_SUB_KMEM);
2266 		} else if (page->vmp_using_mte == tag_not_untag) {
2267 			/* pmap_tag_op shouldn't be called with no effect while
2268 			 * panic_on_redundant_calls is set. Hence the assert below. */
2269 			assert(!panic_on_redundant_calls);
2270 			continue;
2271 		}
2272 
2273 		const unsigned int pai = pa_index(paddr);
2274 		pp_attr_t pp_attr_current, pp_attr_template;
2275 		unsigned int cacheattr = (tag_not_untag ? VM_WIMG_MTE : VM_WIMG_DEFAULT);
2276 
2277 		/**
2278 		 * We should not need the PVH lock here as the VM should not be issuing any concurrent
2279 		 * mappings requests against these pages.
2280 		 */
2281 		os_atomic_rmw_loop(&pp_attr_table[pai], pp_attr_current, pp_attr_template, relaxed, {
2282 			if (tag_not_untag) {
2283 			        if (pp_attr_current & PP_ATTR_WIMG_MASK) {
2284 			                assert3u(pp_attr_current & PP_ATTR_WIMG_MASK, ==, VM_WIMG_DEFAULT);
2285 				}
2286 			} else {
2287 			        assert3u(pp_attr_current & PP_ATTR_WIMG_MASK, ==, VM_WIMG_MTE);
2288 			}
2289 			pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
2290 		});
2291 
2292 		if (num_paddrs == 0) {
2293 			disable_preemption();
2294 			sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
2295 			paddr_list = sptm_pcpu->sptm_paddrs;
2296 		}
2297 		paddr_list[num_paddrs++] = paddr;
2298 		if (num_paddrs == SPTM_MAPPING_LIMIT) {
2299 			if (tag_not_untag) {
2300 				sptm_tag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs, 0);
2301 			} else {
2302 				sptm_untag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs);
2303 			}
2304 			enable_preemption();
2305 			num_paddrs = 0;
2306 		}
2307 	}
2308 
2309 	if (num_paddrs != 0) {
2310 		if (tag_not_untag) {
2311 			sptm_tag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs, 0);
2312 		} else {
2313 			sptm_untag_papt_multipage(sptm_pcpu->sptm_paddrs_pa, num_paddrs);
2314 		}
2315 		enable_preemption();
2316 	}
2317 }
2318 
2319 void
2320 pmap_make_tagged_pages(const unified_page_list_t *page_list)
2321 {
2322 	pmap_tag_op(page_list, true, false);
2323 }
2324 
2325 void
2326 pmap_make_tagged_page(ppnum_t pnum)
2327 {
2328 	upl_page_info_t single_page_upl = { .phys_addr = pnum };
2329 	const unified_page_list_t page_list = {
2330 		.upl = {.upl_info = &single_page_upl, .upl_size = 1},
2331 		.type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
2332 	};
2333 	pmap_tag_op(&page_list, true, true);
2334 }
2335 
2336 void
2337 pmap_unmake_tagged_pages(const unified_page_list_t *page_list)
2338 {
2339 	pmap_tag_op(page_list, false, false);
2340 }
2341 
2342 void
2343 pmap_unmake_tagged_page(ppnum_t pnum)
2344 {
2345 	upl_page_info_t single_page_upl = { .phys_addr = pnum };
2346 	const unified_page_list_t page_list = {
2347 		.upl = {.upl_info = &single_page_upl, .upl_size = 1},
2348 		.type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
2349 	};
2350 	pmap_tag_op(&page_list, false, true);
2351 }
2352 
2353 bool
2354 pmap_is_tag_storage_page(ppnum_t pnum)
2355 {
2356 	return sptm_get_frame_type(ptoa(pnum)) == XNU_TAG_STORAGE;
2357 }
2358 
2359 bool
2360 pmap_in_tag_storage_range(ppnum_t pnum)
2361 {
2362 	pmap_paddr_t addr = ptoa(pnum);
2363 
2364 	return (mte_tag_storage_start <= addr) && (addr < mte_tag_storage_end);
2365 }
2366 
2367 bool
2368 pmap_tag_storage_is_recursive(ppnum_t pnum)
2369 {
2370 	assert(pmap_in_tag_storage_range(pnum));
2371 
2372 	return (mte_tag_storage_recursive_start_pnum <= pnum) &&
2373 	       (pnum < mte_tag_storage_recursive_end_pnum);
2374 }
2375 
2376 bool
2377 pmap_tag_storage_is_unmanaged(ppnum_t pnum)
2378 {
2379 	assert(pmap_in_tag_storage_range(pnum));
2380 
2381 	return (pnum < mte_tag_storage_managed_start_pnum) ||
2382 	       (mte_tag_storage_managed_end_pnum <= pnum);
2383 }
2384 
2385 /*
2386  * Returns whether a physical page is MTE-tagged.
2387  *
2388  * Being a "tagged" page means the following:
2389  * 1. The cache attribute of the backing page is VM_WIMG_MTE.
2390  * 2. There is at least one MTE-enabled mapping of this physical page (the
2391  *     physical aperture mapping, which is explicitly managed alongside the
2392  *     page's cache attributes).
2393  *
2394  * IMPORTANT: this means that even if the mapping that "you" (the caller) have
2395  * is MTE-disabled, this function may still return true.
2396  */
2397 bool
2398 pmap_tag_storage_is_discarded(ppnum_t pnum)
2399 {
2400 	assert(pmap_in_tag_storage_range(pnum));
2401 
2402 	return mte_tag_storage_discarded_start_pnum && (((pnum >= mte_tag_storage_discarded_start_pnum) &&
2403 	       (pnum < mte_tag_storage_discarded_end_pnum)) || ((pnum >= mte_tag_storage_recursive_start_pnum) && (pnum < mte_tag_storage_recursive_discarded_end_pnum)));
2404 }
2405 
2406 bool
2407 pmap_is_tagged_page(ppnum_t pnum)
2408 {
2409 	const pmap_paddr_t pa = ptoa(pnum);
2410 
2411 	if (!pmap_valid_address(pa)) {
2412 		return false;
2413 	}
2414 
2415 	unsigned int wimg = pmap_cache_attributes(pnum);
2416 	return (wimg & VM_WIMG_MASK) == VM_WIMG_MTE;
2417 }
2418 
2419 /*
2420  * Returns whether or not the specific translation corresponding to a given
2421  * virtual address is an MTE-enabled translation.
2422  */
2423 bool
2424 pmap_is_tagged_mapping(pmap_t pmap, vm_map_offset_t va)
2425 {
2426 	pt_entry_t *ptep = pmap_pte(pmap, va);
2427 	return ptep && (*ptep & ARM_PTE_ATTRINDX(CACHE_ATTRINDX_MTE));
2428 }
2429 
2430 void
2431 pmap_make_tag_storage_page(ppnum_t pnum)
2432 {
2433 	/**
2434 	 * Drain the epochs to ensure any lingering batched operations that may have taken
2435 	 * an in-flight reference to this page are complete.
2436 	 */
2437 	pmap_epoch_prepare_drain();
2438 	const pmap_paddr_t pa = ptoa(pnum);
2439 	const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2440 	pmap_epoch_drain();
2441 	sptm_retype(pa, XNU_DEFAULT, XNU_TAG_STORAGE, retype_params);
2442 }
2443 
2444 void
2445 pmap_unmake_tag_storage_page(ppnum_t pnum)
2446 {
2447 	/**
2448 	 * Drain the epochs to ensure any lingering batched operations that may have operated
2449 	 * on just-removed mappings to this tag storage page have completed and are thus no
2450 	 * longer holding an in-flight reference to this page.
2451 	 */
2452 	pmap_epoch_prepare_drain();
2453 	const pmap_paddr_t pa = ptoa(pnum);
2454 	const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2455 	pmap_epoch_drain();
2456 	sptm_retype(pa, XNU_TAG_STORAGE, XNU_DEFAULT, retype_params);
2457 }
2458 
2459 /*
2460  * Given a physical address, calculates the physical page number of the tag
2461  * storage page that covers it.
2462  */
2463 static ppnum_t
2464 map_paddr_to_tag_ppnum(pmap_paddr_t paddr)
2465 {
2466 	uint64_t tag_page_index;
2467 
2468 	assert((paddr >= gDramBase) && (paddr < (gDramBase + gDramSize)));
2469 	assert((paddr & PAGE_MASK) == 0);
2470 
2471 	tag_page_index = atop(paddr - gDramBase) / MTE_PAGES_PER_TAG_PAGE;
2472 	return mte_tag_storage_start_pnum + tag_page_index;
2473 }
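/*
 * Worked example (hypothetical values, assuming 16KB pages and
 * MTE_PAGES_PER_TAG_PAGE == 32): with gDramBase == 0x800000000, a page at
 * paddr 0x800040000 is DRAM page index atop(0x40000) == 16, so it is covered
 * by tag page mte_tag_storage_start_pnum + (16 / 32) == mte_tag_storage_start_pnum.
 */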
2474 
2475 /*
2476  * Given the physical page number of a tag storage page, calculates the physical
2477  * page number of the first page covered by it.
2478  */
2479 ppnum_t
2480 map_tag_ppnum_to_first_covered_ppnum(ppnum_t tag_ppnum)
2481 {
2482 	assert((mte_tag_storage_start_pnum <= tag_ppnum) && (tag_ppnum <= (mte_tag_storage_start_pnum + mte_tag_storage_count)));
2483 
2484 	uint64_t tag_page_index = tag_ppnum - mte_tag_storage_start_pnum;
2485 	return atop(ptoa(tag_page_index * MTE_PAGES_PER_TAG_PAGE) + gDramBase);
2486 }
2487 
2488 #endif /* HAS_MTE */
2489 
2490 /*
2491  * Initialize the count of available pages. No lock needed here,
2492  * as this code is called while kernel boot up is single threaded.
2493  */
2494 static void
2495 initialize_ram_ranges(void)
2496 {
2497 	__assert_only pmap_paddr_t first = first_avail;
2498 	pmap_paddr_t end = avail_end;
2499 
2500 	assert(first <= end);
2501 	assert(first == (first & ~PAGE_MASK));
2502 	assert(end == (end & ~PAGE_MASK));
2503 
2504 	need_ram_ranges_init = false;
2505 
2506 #if HAS_MTE
2507 	if (is_mte_enabled) {
2508 		assert3u(atop(gDramSize) / MTE_PAGES_PER_TAG_PAGE, ==,
2509 		    SPTMArgs->n_tag_storage_frames);
2510 		/* Export MTE tag region boundaries to the VM */
2511 		mte_tag_storage_start = SPTMArgs->first_tag_storage_paddr;
2512 		mte_tag_storage_count = SPTMArgs->n_tag_storage_frames;
2513 		mte_tag_storage_end = mte_tag_storage_start + ptoa(mte_tag_storage_count);
2514 		mte_tag_storage_start_pnum = atop(mte_tag_storage_start);
2515 
2516 		/*
2517 		 * Calculate the bounds of the recursive and managed tag
2518 		 * storage regions.  These will be used to determine which tag
2519 		 * storage pages will never be used to store tags.
2520 		 */
2521 		mte_tag_storage_recursive_start_pnum = map_paddr_to_tag_ppnum(mte_tag_storage_start);
2522 		mte_tag_storage_recursive_end_pnum   = map_paddr_to_tag_ppnum(mte_tag_storage_end);
2523 		mte_tag_storage_managed_start_pnum   = map_paddr_to_tag_ppnum(gPhysBase);
2524 		mte_tag_storage_managed_end_pnum     = map_paddr_to_tag_ppnum(gPhysBase + mem_size);
2525 
2526 		/*
2527 		 * If a capped memory override has been set via maxmem= / hw.memsize,
2528 		 * discard the remainder of memory and adjust the number of tag pages
2529 		 * available to the system by discarding them.
2530 		 */
2531 		if (max_mem != mem_size) {
2532 #define TAG_STORAGE_MASK ((PAGE_SIZE * MTE_PAGES_PER_TAG_PAGE) - 1)
2533 			assert(max_mem <= mem_size);
2534 			assert(!(max_mem & TAG_STORAGE_MASK));
2535 			// Make sure we do not retire a tag page that might have tagged pages associated
2536 			first_avail = (first_avail + TAG_STORAGE_MASK) & ~TAG_STORAGE_MASK;
2537 
2538 			uint64_t discarding = mte_tag_storage_start - (max_mem - max_mem / MTE_PAGES_PER_TAG_PAGE) - avail_start;
2539 			/*
2540 			 * Also align how much we discard up to a ~512KiB boundary. We might be
2541 			 * over/under discarding +=512KiB here (which is fine accuracy wise because
2542 			 * ml_static_mfree will also release different amount of memory depending on the
2543 			 * actual device config)
2544 			 */
2545 			discarding &= ~TAG_STORAGE_MASK;
2546 
2547 			mte_tag_storage_discarded_start_pnum = map_paddr_to_tag_ppnum(first_avail);
2548 
2549 			first_avail += discarding;
2550 
2551 			mte_tag_storage_discarded_end_pnum = map_paddr_to_tag_ppnum(first_avail);
2552 
2553 			/*
2554 			 * Also adjust the number of recursive dead tag storage pages to match
2555 			 * the capped memory size
2556 			 */
2557 			mte_tag_storage_recursive_discarded_end_pnum = mte_tag_storage_recursive_end_pnum - (mte_tag_storage_recursive_end_pnum - mte_tag_storage_recursive_start_pnum) * max_mem / mem_size;
2558 		}
2559 	} else {
2560 		assert3u(SPTMArgs->n_tag_storage_frames, ==, 0);
2561 	}
2562 
2563 #if DEVELOPMENT || DEBUG
2564 	printf("MTE [0x%llx, 0x%llx]\n", mte_tag_storage_start, mte_tag_storage_end);
2565 	printf("MTE tag storage 0x%x\n", mte_tag_storage_count);
2566 #endif /* DEVELOPMENT || DEBUG */
2567 #endif /* HAS_MTE */
2568 	avail_page_count = atop(end - first_avail);
2569 }
2570 
2571 unsigned int
2572 pmap_free_pages(
2573 	void)
2574 {
2575 	if (need_ram_ranges_init) {
2576 		initialize_ram_ranges();
2577 	}
2578 	return avail_page_count;
2579 }
2580 
2581 unsigned int
2582 pmap_free_pages_span(
2583 	void)
2584 {
2585 	if (need_ram_ranges_init) {
2586 		initialize_ram_ranges();
2587 	}
2588 	return (unsigned int)atop(avail_end - first_avail);
2589 }
2590 
2591 
2592 boolean_t
2593 pmap_next_page_hi(
2594 	ppnum_t            * pnum,
2595 	__unused boolean_t might_free)
2596 {
2597 	return pmap_next_page(pnum);
2598 }
2599 
2600 
2601 boolean_t
2602 pmap_next_page(
2603 	ppnum_t *pnum)
2604 {
2605 	if (need_ram_ranges_init) {
2606 		initialize_ram_ranges();
2607 	}
2608 
2609 
2610 	if (first_avail != avail_end) {
2611 		*pnum = (ppnum_t)atop(first_avail);
2612 		first_avail += PAGE_SIZE;
2613 		assert(avail_page_count > 0);
2614 		--avail_page_count;
2615 		return TRUE;
2616 	}
2617 	assert(avail_page_count == 0);
2618 	return FALSE;
2619 }
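/*
 * Editorial sketch (not part of the original source): early VM bootstrap code
 * consumes boot-time pages one at a time, roughly like this; pages_needed and
 * zero_and_use_page() are hypothetical.
 *
 *	ppnum_t pn;
 *	while (pages_needed > 0 && pmap_next_page(&pn)) {
 *		zero_and_use_page(pn);
 *		pages_needed--;
 *	}
 */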
2620 
2621 
2622 
2623 
2624 /**
2625  * Helper function to check whether the given physical
2626  * page number is a restricted page.
2627  *
2628  * @param pn the physical page number to query.
2629  */
2630 bool
2631 pmap_is_page_restricted(ppnum_t pn)
2632 {
2633 	sptm_frame_type_t frame_type = sptm_get_frame_type(ptoa(pn));
2634 	return frame_type == XNU_KERNEL_RESTRICTED;
2635 }
2636 
2637 /*
2638  *	Initialize the pmap module.
2639  *	Called by vm_init, to initialize any structures that the pmap
2640  *	system needs to map virtual memory.
2641  */
2642 void
2643 pmap_init(
2644 	void)
2645 {
2646 	/*
2647 	 *	Protect page zero in the kernel map.
2648 	 *	(can be overruled by permanent translation
2649 	 *	table entries at page zero - see arm_vm_init).
2650 	 */
2651 	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2652 
2653 	pmap_initialized = TRUE;
2654 
2655 	/*
2656 	 *	Create the zone of physical maps
2657 	 *	and the physical-to-virtual entries.
2658 	 */
2659 	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2660 	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2661 
2662 
2663 	/*
2664 	 *	Initialize the pmap object (for tracking the vm_page_t
2665 	 *	structures for pages we allocate to be page tables in
2666 	 *	pmap_expand().
2667 	 */
2668 	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
2669 	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2670 
2671 	/*
2672 	 *	Initialize the TXM VM object in the same way as the
2673 	 *	PMAP VM object.
2674 	 */
2675 	_vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL);
2676 	txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2677 
2678 	/*
2679 	 * The values of [hard_]maxproc may have been scaled, make sure
2680 	 * they are still less than the value of pmap_max_asids.
2681 	 */
2682 	if ((uint32_t)maxproc > pmap_max_asids) {
2683 		maxproc = pmap_max_asids;
2684 	}
2685 	if ((uint32_t)hard_maxproc > pmap_max_asids) {
2686 		hard_maxproc = pmap_max_asids;
2687 	}
2688 }
2689 
2690 /**
2691  * Verify that a given physical page contains no mappings (outside of the
2692  * default physical aperture mapping).
2693  *
2694  * @param ppnum Physical page number to check there are no mappings to.
2695  *
2696  * @return True if there are no mappings, false otherwise or if the page is not
2697  *         kernel-managed.
2698  */
2699 bool
2700 pmap_verify_free(ppnum_t ppnum)
2701 {
2702 	const pmap_paddr_t pa = ptoa(ppnum);
2703 
2704 	assert(pa != vm_page_fictitious_addr);
2705 
2706 	/* Only mappings to kernel-managed physical memory are tracked. */
2707 	if (!pa_valid(pa)) {
2708 		return false;
2709 	}
2710 
2711 	const unsigned int pai = pa_index(pa);
2712 
2713 	return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL);
2714 }
2715 
2716 
2717 #if __ARM64_PMAP_SUBPAGE_L1__
2718 static inline bool
2719 pmap_user_root_size_matches_subpage_l1(vm_size_t root_size)
2720 {
2721 	return root_size == 8 * sizeof(tt_entry_t);
2722 }
2723 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
2724 
2725 static vm_size_t
2726 pmap_root_alloc_size(pmap_t pmap)
2727 {
2728 #pragma unused(pmap)
2729 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2730 	const unsigned int root_level = pt_attr_root_level(pt_attr);
2731 	const uint64_t index = pt_attr_va_valid_mask(pt_attr);
2732 	return ((index >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2733 }
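/*
 * Worked example (hypothetical geometry): for a user pmap whose root level
 * resolves an 11-bit VA index (e.g. bits [46:36] with a 16KB granule),
 * pt_attr_va_valid_mask() shifted down by the root-level shift yields 0x7FF,
 * so the root table needs (0x7FF + 1) * sizeof(tt_entry_t) == 16KB.
 */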
2734 
2735 /*
2736  *	Create and return a physical map.
2737  *
2738  *	If the size specified for the map
2739  *	is zero, the map is an actual physical
2740  *	map, and may be referenced by the
2741  *	hardware.
2742  *
2743  *	If the size specified is non-zero,
2744  *	the map will be used in software only, and
2745  *	is bounded by that size.
2746  */
2747 MARK_AS_PMAP_TEXT pmap_t
2748 pmap_create_options_internal(
2749 	ledger_t ledger,
2750 	vm_map_size_t size,
2751 	unsigned int flags,
2752 	kern_return_t *kr)
2753 {
2754 	pmap_t          p;
2755 	bool is_64bit = flags & PMAP_CREATE_64BIT;
2756 #if defined(HAS_APPLE_PAC)
2757 	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2758 #endif /* defined(HAS_APPLE_PAC) */
2759 	kern_return_t   local_kr = KERN_SUCCESS;
2760 	__unused uint8_t sptm_root_flags = SPTM_ROOT_PT_FLAGS_DEFAULT;
2761 	TXMAddressSpaceFlags_t txm_flags = kTXMAddressSpaceFlagInit;
2762 	const bool is_stage2 = false;
2763 
2764 	if (size != 0) {
2765 		{
2766 			// Size parameter should only be set for stage 2.
2767 			return PMAP_NULL;
2768 		}
2769 	}
2770 
2771 	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2772 		return PMAP_NULL;
2773 	}
2774 
2775 	/*
2776 	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
2777 	 *	the translation table of the right size for the pmap.
2778 	 */
2779 	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2780 		local_kr = KERN_RESOURCE_SHORTAGE;
2781 		goto pmap_create_fail;
2782 	}
2783 
2784 	p->ledger = ledger;
2785 
2786 
2787 	p->pmap_vm_map_cs_enforced = false;
2788 	p->min = 0;
2789 
2790 
2791 #if CONFIG_ROSETTA
2792 	if (flags & PMAP_CREATE_ROSETTA) {
2793 		p->is_rosetta = TRUE;
2794 	} else {
2795 		p->is_rosetta = FALSE;
2796 	}
2797 #endif /* CONFIG_ROSETTA */
2798 #if defined(HAS_APPLE_PAC)
2799 	p->disable_jop = disable_jop;
2800 
2801 	if (p->disable_jop) {
2802 		sptm_root_flags &= ~SPTM_ROOT_PT_FLAG_JOP;
2803 	}
2804 #endif /* defined(HAS_APPLE_PAC) */
2805 
2806 	p->nested_region_true_start = 0;
2807 	p->nested_region_true_end = ~0;
2808 
2809 	p->nx_enabled = true;
2810 	p->is_64bit = is_64bit;
2811 
2812 	if (!is_64bit) {
2813 		sptm_root_flags |= SPTM_ROOT_PT_FLAG_ARM64_32;
2814 	}
2815 
2816 	p->nested_pmap = PMAP_NULL;
2817 	p->type = PMAP_TYPE_USER;
2818 
2819 #if ARM_PARAMETERIZED_PMAP
2820 	/* Default to the native pt_attr */
2821 	p->pmap_pt_attr = native_pt_attr;
2822 #endif /* ARM_PARAMETERIZED_PMAP */
2823 #if __ARM_MIXED_PAGE_SIZE__
2824 	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2825 		p->pmap_pt_attr = &pmap_pt_attr_4k;
2826 	}
2827 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2828 	p->max = pmap_user_va_size(p);
2829 
2830 	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2831 		local_kr = KERN_NO_SPACE;
2832 		goto id_alloc_fail;
2833 	}
2834 
2835 	/**
2836 	 * We expect top level translation tables to always fit into a single
2837 	 * physical page. This would also catch a misconfiguration if 4K
2838 	 * concatenated page tables needed more than one physical tt1 page.
2839 	 */
2840 	vm_size_t pmap_root_size = pmap_root_alloc_size(p);
2841 	if (__improbable(pmap_root_size > PAGE_SIZE)) {
2842 		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size);
2843 	}
2844 
2845 #if __ARM64_PMAP_SUBPAGE_L1__
2846 	/**
2847 	 * Identify the case where the root qualifies for SURT, and update the
2848 	 * root size to the TTEs + the SPTM metadata, reflecting the actual
2849 	 * space taken by this subpage root table.
2850 	 */
2851 	if (!(flags & PMAP_CREATE_NESTED) && pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
2852 		pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
2853 	}
2854 #endif
2855 
2856 	pmap_lock_init(p);
2857 
2858 	p->tte = pmap_tt1_allocate(p, pmap_root_size, sptm_root_flags);
2859 	if (!(p->tte)) {
2860 		local_kr = KERN_RESOURCE_SHORTAGE;
2861 		goto tt1_alloc_fail;
2862 	}
2863 
2864 	p->ttep = kvtophys_nofail((vm_offset_t)p->tte);
2865 	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2866 
2867 	/*
2868 	 *  initialize the rest of the structure
2869 	 */
2870 	p->nested_region_addr = 0x0ULL;
2871 	p->nested_region_size = 0x0ULL;
2872 	p->nested_region_unnested_table_bitmap = NULL;
2873 
2874 	p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE;
2875 #if HAS_MTE
2876 	p->restrict_receiving_aliases_to_tagged_memory = false;
2877 #endif /* HAS_MTE */
2878 
2879 #if MACH_ASSERT
2880 	p->pmap_pid = 0;
2881 	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
2882 #endif /* MACH_ASSERT */
2883 #if DEVELOPMENT || DEBUG
2884 	p->footprint_was_suspended = FALSE;
2885 #endif /* DEVELOPMENT || DEBUG */
2886 
2887 	os_ref_init_count_raw(&p->ref_count, &pmap_refgrp, 1);
2888 	pmap_simple_lock(&pmaps_lock);
2889 	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
2890 	pmap_simple_unlock(&pmaps_lock);
2891 
2892 	/**
2893 	 * The SPTM pmap's concurrency model can sometimes allow ledger balances to transiently
2894 	 * go negative.  Note that we still check overall ledger balance on pmap destruction.
2895 	 */
2896 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
2897 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
2898 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
2899 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
2900 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
2901 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
2902 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
2903 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
2904 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
2905 
2906 	if (!is_stage2) {
2907 		/*
2908 		 * Complete initialization for the TXM address space. This needs to be done
2909 		 * after the SW ASID has been registered with the SPTM.
2910 		 * TXM enforcement does not apply to virtual machines.
2911 		 */
2912 		if (flags & PMAP_CREATE_TEST) {
2913 			txm_flags |= kTXMAddressSpaceFlagTest;
2914 		}
2915 
2916 		pmap_txmlock_init(p);
2917 		txm_register_address_space(p, p->asid, txm_flags);
2918 		p->txm_trust_level = kCSTrustUntrusted;
2919 	}
2920 
2921 	return p;
2922 
2923 tt1_alloc_fail:
2924 	pmap_get_pt_ops(p)->free_id(p);
2925 id_alloc_fail:
2926 	zfree(pmap_zone, p);
2927 pmap_create_fail:
2928 	*kr = local_kr;
2929 	return PMAP_NULL;
2930 }
2931 
2932 pmap_t
2933 pmap_create_options(
2934 	ledger_t ledger,
2935 	vm_map_size_t size,
2936 	unsigned int flags)
2937 {
2938 	pmap_t pmap;
2939 	kern_return_t kr = KERN_SUCCESS;
2940 
2941 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
2942 
2943 	ledger_reference(ledger);
2944 
2945 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
2946 
2947 	if (pmap == PMAP_NULL) {
2948 		ledger_dereference(ledger);
2949 	}
2950 
2951 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2952 
2953 	return pmap;
2954 }
2955 
2956 #if MACH_ASSERT
2957 MARK_AS_PMAP_TEXT void
2958 pmap_set_process_internal(
2959 	__unused pmap_t pmap,
2960 	__unused int pid,
2961 	__unused char *procname)
2962 {
2963 	if (pmap == NULL || pmap->pmap_pid == -1) {
2964 		return;
2965 	}
2966 
2967 	validate_pmap_mutable(pmap);
2968 
2969 	pmap->pmap_pid = pid;
2970 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
2971 }
2972 #endif /* MACH_ASSERT */
2973 
2974 #if MACH_ASSERT
2975 void
2976 pmap_set_process(
2977 	pmap_t pmap,
2978 	int pid,
2979 	char *procname)
2980 {
2981 	pmap_set_process_internal(pmap, pid, procname);
2982 }
2983 #endif /* MACH_ASSERT */
2984 
2985 /*
2986  * pmap_deallocate_all_leaf_tts:
2987  *
2988  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
2989  * removing and deallocating all TTEs.
2990  */
2991 MARK_AS_PMAP_TEXT static void
2992 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_address_t start_va, unsigned level)
2993 {
2994 	tt_entry_t tte = ARM_TTE_EMPTY;
2995 	tt_entry_t * ttep = NULL;
2996 	tt_entry_t * last_ttep = NULL;
2997 
2998 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2999 	const uint64_t size = pt_attr->pta_level_info[level].size;
3000 
3001 	assert(level < pt_attr_leaf_level(pt_attr));
3002 
3003 	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3004 
3005 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3006 	vm_map_address_t va = start_va;
3007 	for (ttep = first_ttep; ttep <= last_ttep; ttep += page_ratio, va += (size * page_ratio)) {
3008 		if (!(*ttep & ARM_TTE_VALID)) {
3009 			continue;
3010 		}
3011 
3012 		for (unsigned i = 0; i < page_ratio; i++) {
3013 			tte = ttep[i];
3014 
3015 			if (!(tte & ARM_TTE_VALID)) {
3016 				panic("%s: found unexpectedly invalid tte, ttep=%p, tte=%p, "
3017 				    "pmap=%p, first_ttep=%p, level=%u",
3018 				    __FUNCTION__, ttep + i, (void *)tte,
3019 				    pmap, first_ttep, level);
3020 			}
3021 
3022 			if (tte_is_block(tte)) {
3023 				panic("%s: found block mapping, ttep=%p, tte=%p, "
3024 				    "pmap=%p, first_ttep=%p, level=%u",
3025 				    __FUNCTION__, ttep + i, (void *)tte,
3026 				    pmap, first_ttep, level);
3027 			}
3028 
3029 			/* Must be valid, type table */
3030 			if (level < pt_attr_twig_level(pt_attr)) {
3031 				/* If we haven't reached the twig level, recurse to the next level. */
3032 				pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK),
3033 				    va + (size * i), level + 1);
3034 			}
3035 		}
3036 
3037 		/* Remove the TTE. */
3038 		pmap_tte_deallocate(pmap, va, ttep, level, false);
3039 	}
3040 }
3041 
3042 /*
3043  * We maintain stats and ledgers so that a task's physical footprint is:
3044  * phys_footprint = ((internal - alternate_accounting)
3045  *                   + (internal_compressed - alternate_accounting_compressed)
3046  *                   + iokit_mapped
3047  *                   + purgeable_nonvolatile
3048  *                   + purgeable_nonvolatile_compressed
3049  *                   + page_table)
3050  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3051  */
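/*
 * For example (hypothetical page counts): internal=100, alternate_accounting=10,
 * internal_compressed=20, alternate_accounting_compressed=0, iokit_mapped=5,
 * purgeable_nonvolatile=0, purgeable_nonvolatile_compressed=0 and page_table=3
 * give phys_footprint = (100 - 10) + (20 - 0) + 5 + 0 + 0 + 3 = 118 pages.
 */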
3052 
3053 /*
3054  *	Retire the given physical map from service.
3055  *	Should only be called if the map contains
3056  *	no valid mappings.
3057  */
3058 MARK_AS_PMAP_TEXT void
3059 pmap_destroy_internal(
3060 	pmap_t pmap)
3061 {
3062 	if (pmap == PMAP_NULL) {
3063 		return;
3064 	}
3065 
3066 	validate_pmap(pmap);
3067 
3068 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3069 	const bool is_stage2_pmap = false;
3070 
3071 	if (os_ref_release_raw(&pmap->ref_count, &pmap_refgrp) > 0) {
3072 		return;
3073 	}
3074 
3075 	if (!is_stage2_pmap) {
3076 		/*
3077 		 * Complete all clean up required for TXM. This needs to happen before the
3078 		 * SW ASID has been unregistered with the SPTM.
3079 		 */
3080 		txm_unregister_address_space(pmap);
3081 		pmap_txmlock_destroy(pmap);
3082 	}
3083 
3084 	/**
3085 	 * Drain any concurrent retype-sensitive SPTM operations.  This is needed to
3086 	 * ensure that we don't unmap and retype the page tables while those operations
3087 	 * are still finishing on other CPUs, leading to an SPTM violation.  In particular,
3088 	 * the multipage batched cacheability/attribute update code may issue SPTM calls
3089 	 * without holding the relevant PVH or pmap locks, so we can't guarantee those
3090 	 * calls have actually completed despite observing refcnt == 0.
3091 	 *
3092 	 * At this point, we CAN guarantee that:
3093 	 * 1) All prior PTE removals required to empty the pmap have completed and
3094 	 *    been synchronized with DSB, *except* the commpage removal which doesn't
3095 	 *    involve pages that can ever be retyped.  Subsequent calls not already
3096 	 *    in the pmap epoch will no longer observe these mappings.
3097 	 * 2) The pmap now has a zero refcount, so in a correctly functioning system
3098 	 *    no further mappings will be requested for it.
3099 	 */
3100 	pmap_epoch_prepare_drain();
3101 
3102 	if (!is_stage2_pmap) {
3103 		pmap_unmap_commpage(pmap);
3104 	}
3105 
3106 	pmap_simple_lock(&pmaps_lock);
3107 	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3108 	pmap_simple_unlock(&pmaps_lock);
3109 
3110 	pmap_epoch_drain();
3111 
3112 	/*
3113 	 *	Free the memory maps, then the
3114 	 *	pmap structure.
3115 	 */
3116 	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr));
3117 
3118 	if (pmap->tte) {
3119 		vm_size_t pmap_root_size = pmap_root_alloc_size(pmap);
3120 #if __ARM64_PMAP_SUBPAGE_L1__
3121 		/**
3122 		 * Like in the allocation path, identify the case where the root table
3123 		 * qualifies for SURT.
3124 		 */
3125 		if (pmap_user_root_size_matches_subpage_l1(pmap_root_size)) {
3126 			/**
3127 			 * Nested tables cannot use SURT, so the allocated size has to be
3128 			 * PAGE_SIZE.
3129 			 */
3130 			if (pmap_is_nested(pmap)) {
3131 				pmap_root_size = PAGE_SIZE;
3132 			} else {
3133 				/**
3134 				 * Note: with SPTM, the kernel pmap is never supposed to be
3135 				 * destroyed because the SPTM relies on the existence of the
3136 				 * kernel root table. Also, no commpage-typed pmap ever reaches
3137 				 * this path: the pmap associated with a commpage table is a
3138 				 * plain PMAP_TYPE_USER pmap that is transient and destroyed
3139 				 * right after the commpage table is set up, so only user root
3140 				 * tables are expected here.
3141 				 */
3142 				assert(pmap->type == PMAP_TYPE_USER);
3143 				pmap_root_size = SUBPAGE_USER_ROOT_TABLE_SIZE;
3144 			}
3145 		}
3146 #endif
3147 		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_size);
3148 		pmap->tte = (tt_entry_t *) NULL;
3149 		pmap->ttep = 0;
3150 	}
3151 
3152 	if (pmap->type != PMAP_TYPE_NESTED) {
3153 		/* return its asid to the pool */
3154 		pmap_get_pt_ops(pmap)->free_id(pmap);
3155 		if (pmap->nested_pmap != NULL) {
3156 			/* release the reference we hold on the nested pmap */
3157 			pmap_destroy_internal(pmap->nested_pmap);
3158 		}
3159 	}
3160 
3161 	pmap_check_ledgers(pmap);
3162 
3163 	if ((pmap->type == PMAP_TYPE_NESTED) && (pmap->nested_region_unnested_table_bitmap != NULL)) {
3164 		bitmap_free(pmap->nested_region_unnested_table_bitmap,
3165 		    (pmap->nested_region_size >> (pt_attr_twig_shift(pt_attr) - 1)));
3166 	}
3167 
3168 	pmap_lock_destroy(pmap);
3169 	zfree(pmap_zone, pmap);
3170 }
3171 
3172 void
3173 pmap_destroy(
3174 	pmap_t pmap)
3175 {
3176 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
3177 
3178 	ledger_t ledger = pmap->ledger;
3179 
3180 	pmap_destroy_internal(pmap);
3181 
3182 	ledger_dereference(ledger);
3183 
3184 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3185 }
3186 
3187 
3188 /*
3189  *	Add a reference to the specified pmap.
3190  */
3191 MARK_AS_PMAP_TEXT void
3192 pmap_reference_internal(
3193 	pmap_t pmap)
3194 {
3195 	if (pmap != PMAP_NULL) {
3196 		validate_pmap_mutable(pmap);
3197 		os_ref_retain_raw(&pmap->ref_count, &pmap_refgrp);
3198 	}
3199 }
3200 
3201 void
3202 pmap_reference(
3203 	pmap_t pmap)
3204 {
3205 	pmap_reference_internal(pmap);
3206 }
3207 
3208 static sptm_frame_type_t
3209 get_sptm_pt_type(pmap_t pmap)
3210 {
3211 	const bool is_stage2_pmap = false;
3212 	if (is_stage2_pmap) {
3213 		assert(pmap->type != PMAP_TYPE_NESTED);
3214 		return XNU_STAGE2_PAGE_TABLE;
3215 	} else {
3216 		return pmap->type == PMAP_TYPE_NESTED ? XNU_PAGE_TABLE_SHARED : XNU_PAGE_TABLE;
3217 	}
3218 }
3219 
3220 static tt_entry_t *
3221 pmap_tt1_allocate(pmap_t pmap, vm_size_t size, uint8_t sptm_root_flags)
3222 {
3223 	pmap_paddr_t pa = 0;
3224 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3225 	const bool is_stage2_pmap = false;
3226 
3227 	/**
3228 	 * Allocate an entire page for the root-level page table unless it is a
3229 	 * subpage L1 table, in which case size will be exactly PMAP_ROOT_ALLOC_SIZE.
3230 	 */
3231 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3232 		size = PAGE_SIZE;
3233 	}
3234 
3235 #if __ARM64_PMAP_SUBPAGE_L1__
3236 	/**
3237 	 * At this moment, the allocation size is smaller than the page size only
3238 	 * when it is a subpage L1 table. We will try to allocate a root table
3239 	 * from the SURTs (SUbpage Root Tables).
3240 	 */
3241 	const bool use_surt = (size < PAGE_SIZE);
3242 	if (use_surt) {
3243 		/* It has to be a user pmap. */
3244 		assert(pmap->type == PMAP_TYPE_USER);
3245 
3246 		/**
3247 		 * Subpage stage 2 root tables are not supported. This is guaranteed
3248 		 * by stage 2 pmaps using a different pmap geometry than stage 1
3249 		 * pmaps.
3250 		 */
3251 		assert(!is_stage2_pmap);
3252 
3253 		/* Try allocating a SURT from the SURT page queue. */
3254 		pa = surt_try_alloc();
3255 
3256 		/* If there is one SURT available, call SPTM to claim the SURT. */
3257 		if (pa) {
3258 			sptm_surt_alloc(surt_page_pa_from_surt_pa(pa),
3259 			    surt_index_from_surt_pa(pa),
3260 			    pt_attr->geometry_id,
3261 			    sptm_root_flags,
3262 			    pmap->asid);
3263 
3264 			/* We don't need to allocate a new page, so skip to the end. */
3265 			goto ptt1a_done;
3266 		}
3267 	}
3268 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3269 
3270 	/**
3271 	 * Either the root table size is not suitable for SURT or SURT is out of
3272 	 * tables. In either case, a page needs to be allocated.
3273 	 */
3274 	const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL);
3275 
3276 	/* No page is allocated, so return 0 to signal failure. */
3277 	if (ret != KERN_SUCCESS) {
3278 		return (tt_entry_t *)0;
3279 	}
3280 
3281 	/**
3282 	 * Drain the epochs to ensure any lingering batched operations that may have
3283 	 * taken an in-flight reference to this page are complete.
3284 	 */
3285 	pmap_epoch_prepare_drain();
3286 
3287 	assert(pa);
3288 
3289 #if __ARM64_PMAP_SUBPAGE_L1__
3290 	if (use_surt) {
3291 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3292 
3293 		pmap_epoch_drain();
3294 
3295 		/**
3296 		 * The allocated page is retyped to XNU_SUBPAGE_USER_ROOT_TABLES as the
3297 		 * container of the SURTs.
3298 		 */
3299 		sptm_retype(pa, XNU_DEFAULT, XNU_SUBPAGE_USER_ROOT_TABLES, retype_params);
3300 
3301 		/**
3302 		 * Before we add the page to the SURT page queue, claim the first SURT
3303 		 * for ourselves. This is safe since we are the only one accessing this
3304 		 * page at this moment.
3305 		 */
3306 		sptm_surt_alloc(pa, 0, pt_attr->geometry_id, sptm_root_flags, pmap->asid);
3307 
3308 		/**
3309 		 * Add the newly allocated SURT page to the page queue.
3310 		 */
3311 		surt_feed_page_with_first_table_allocated(pa);
3312 	} else
3313 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3314 	{
3315 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3316 		retype_params.attr_idx = pt_attr->geometry_id;
3317 		retype_params.flags = sptm_root_flags;
3318 		if (is_stage2_pmap) {
3319 			retype_params.vmid = pmap->vmid;
3320 		} else {
3321 			retype_params.asid = pmap->asid;
3322 		}
3323 
3324 		pmap_epoch_drain();
3325 
3326 		sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE,
3327 		    retype_params);
3328 	}
3329 
3330 #if __ARM64_PMAP_SUBPAGE_L1__
3331 ptt1a_done:
3332 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3333 	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
3334 	 * Depending on the device, this can vary between 512 bytes and 16K. */
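	/*
	 * Hypothetical example: if PMAP_ROOT_ALLOC_SIZE were 512 bytes and the
	 * root table occupied a full 16K page, this would add 32 units to the
	 * counter below.
	 */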
3335 	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3336 	pmap_tt_ledger_credit(pmap, size);
3337 
3338 	return (tt_entry_t *) phystokv(pa);
3339 }
3340 
3341 static void
3342 pmap_tt1_deallocate(
3343 	pmap_t pmap,
3344 	tt_entry_t *tt,
3345 	vm_size_t size)
3346 {
3347 	pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt);
3348 	const bool is_stage2_pmap = false;
3349 
3350 	/**
3351 	 * Free the entire page unless it is a subpage L1 table, in which case
3352 	 * size will be exactly PMAP_ROOT_ALLOC_SIZE.
3353 	 */
3354 	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3355 		size = PAGE_SIZE;
3356 	}
3357 
3358 #if __ARM64_PMAP_SUBPAGE_L1__
3359 	/**
3360 	 * At this moment, the free size is smaller than the page size only
3361 	 * when it is a subpage L1 table. We will try to free the root table
3362 	 * from the SURT page.
3363 	 */
3364 	const bool use_surt = (size < PAGE_SIZE);
3365 	if (use_surt) {
3366 		/* It has to be a user pmap. */
3367 		assert(pmap->type == PMAP_TYPE_USER);
3368 
3369 		/* Subpage stage 2 root table is not supported. */
3370 		assert(!is_stage2_pmap);
3371 
3372 		/* Before we do anything in pmap, tell SPTM that the SURT is free. */
3373 		sptm_surt_free(surt_page_pa_from_surt_pa(pa),
3374 		    surt_index_from_surt_pa(pa));
3375 
3376 		/**
3377 		 * Make sure the SURT bitmap update is not reordered before the SPTM
3378 		 * rw guard release.
3379 		 */
3380 		os_atomic_thread_fence(release);
3381 
3382 		/**
3383 		 * Free the SURT in pmap scope. If surt_free() returns false, there
3384 		 * are still other SURTs in use on the page. In that case, do not
3385 		 * retype or free the page; just skip to the end to finish accounting.
3386 		 */
3387 		if (!surt_free(pa)) {
3388 			goto ptt1d_done;
3389 		}
3390 
3391 		/**
3392 		 * Make sure the SURT bitmap read is not reordered after the SPTM
3393 		 * rw guard exclusive acquire in the retype case.
3394 		 */
3395 		os_atomic_thread_fence(acquire);
3396 	}
3397 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3398 
3399 	sptm_frame_type_t page_type;
3400 #if __ARM64_PMAP_SUBPAGE_L1__
3401 	if (use_surt) {
3402 		page_type = XNU_SUBPAGE_USER_ROOT_TABLES;
3403 	} else
3404 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3405 	if (is_stage2_pmap) {
3406 		page_type = XNU_STAGE2_ROOT_TABLE;
3407 	} else if (pmap->type == PMAP_TYPE_NESTED) {
3408 		page_type = XNU_SHARED_ROOT_TABLE;
3409 	} else {
3410 		page_type = XNU_USER_ROOT_TABLE;
3411 	}
3412 
3413 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3414 	sptm_retype(pa & ~PAGE_MASK, page_type, XNU_DEFAULT, retype_params);
3415 	pmap_page_free(pa & ~PAGE_MASK);
3416 
3417 #if __ARM64_PMAP_SUBPAGE_L1__
3418 ptt1d_done:
3419 #endif /* __ARM64_PMAP_SUBPAGE_L1__ */
3420 	OSAddAtomic(-(int32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3421 	pmap_tt_ledger_debit(pmap, size);
3422 }
3423 
3424 MARK_AS_PMAP_TEXT static kern_return_t
3425 pmap_tt_allocate(
3426 	pmap_t pmap,
3427 	tt_entry_t **ttp,
3428 	pt_desc_t **ptdp_out,
3429 	unsigned int level,
3430 	unsigned int options)
3431 {
3432 	pmap_paddr_t pa;
3433 	const unsigned int alloc_flags =
3434 	    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;
3435 
3436 	/* Allocate a VM page to be used as the page table. */
3437 	if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
3438 		return KERN_RESOURCE_SHORTAGE;
3439 	}
3440 
3441 	pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags);
3442 	if (ptdp == NULL) {
3443 		pmap_page_free(pa);
3444 		return KERN_RESOURCE_SHORTAGE;
3445 	}
3446 
3447 	unsigned int pai = pa_index(pa);
3448 	locked_pvh_t locked_pvh = pvh_lock(pai);
3449 	assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p",
3450 	    __func__, (void*)locked_pvh.pvh);
3451 
3452 	/**
3453 	 * Drain the epochs to ensure any lingering batched operations that may have taken
3454 	 * an in-flight reference to this page are complete.
3455 	 */
3456 	pmap_epoch_prepare_drain();
3457 
3458 	if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3459 		OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3460 	} else {
3461 		OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3462 	}
3463 
3464 	pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3465 
3466 	PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3467 
3468 	pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
3469 	pvh_unlock(&locked_pvh);
3470 
3471 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3472 	retype_params.level = (sptm_pt_level_t)level;
3473 
3474 	/**
3475 	 * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages
3476 	 * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above.
3477 	 */
3478 	pmap_epoch_drain();
3479 
3480 	sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params);
3481 
3482 	*ptdp_out = ptdp;
3483 	*ttp = (tt_entry_t *)phystokv(pa);
3484 
3485 	return KERN_SUCCESS;
3486 }
3487 
3488 static void
3489 pmap_tt_deallocate(
3490 	pmap_t pmap,
3491 	tt_entry_t *ttp,
3492 	unsigned int level)
3493 {
3494 	pt_desc_t *ptdp;
3495 	vm_offset_t     free_page = 0;
3496 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3497 
3498 	ptdp = ptep_get_ptd(ttp);
3499 	ptdp->va = (vm_offset_t)-1;
3500 
3501 	const uint16_t refcnt = sptm_get_page_table_refcnt(kvtophys_nofail((vm_offset_t)ttp));
3502 
3503 	if (__improbable(refcnt != 0)) {
3504 		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, refcnt);
3505 	}
3506 
3507 	free_page = (vm_offset_t)ttp & ~PAGE_MASK;
3508 	if (free_page != 0) {
3509 		pmap_paddr_t pa = kvtophys_nofail(free_page);
3510 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
3511 		sptm_retype(pa, get_sptm_pt_type(pmap), XNU_DEFAULT, retype_params);
3512 		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
3513 
3514 		unsigned int pai = pa_index(pa);
3515 		locked_pvh_t locked_pvh = pvh_lock(pai);
3516 		assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTDP), "%s: non-PTD PVH %p",
3517 		    __func__, (void*)locked_pvh.pvh);
3518 		pvh_update_head(&locked_pvh, NULL, PVH_TYPE_NULL);
3519 		pvh_unlock(&locked_pvh);
3520 		pmap_page_free(pa);
3521 		if (level < pt_attr_leaf_level(pt_attr)) {
3522 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3523 		} else {
3524 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3525 		}
3526 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3527 		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3528 	}
3529 }
3530 
3531 /**
3532  * Check a page table's refcounts after clearing a translation table entry pointing to that table.
3533  *
3534  * @note If the cleared TTE points to a leaf table, then that leaf table
3535  *       must have a refcnt of zero before the TTE can be removed.
3536  *
3537  * @param pmap The pmap containing the page table whose TTE is being removed.
3538  * @param tte Value stored in the TTE prior to clearing it
3539  * @param level The level of the page table that contains the TTE being removed
3540  */
3541 static void
3542 pmap_tte_check_refcounts(
3543 	pmap_t pmap,
3544 	tt_entry_t tte,
3545 	unsigned int level)
3546 {
3547 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3548 
3549 	/**
3550 	 * Remember, the passed in "level" parameter refers to the level above the
3551 	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
3552 	 * page table).
3553 	 */
3554 	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
3555 
3556 	unsigned short refcnt = 0;
3557 
3558 	/**
3559 	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
3560 	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
3561 	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
3562 	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
3563 	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
3564 	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
3565 	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
3566 	 * synchronize it against the disconnect operation.  If that removal caused the
3567 	 * refcount to reach zero, the pagetable page could be freed before the disconnect
3568 	 * operation is finished using the relevant pagetable descriptor.
3569 	 * Address these cases by draining the epochs to ensure other cores are no longer
3570 	 * consuming the page table we're preparing to delete.
3571 	 */
3572 	if (remove_leaf_table) {
3573 		pmap_epoch_prepare_drain();
3574 		pmap_epoch_drain();
3575 		refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte));
3576 	}
3577 
3578 #if MACH_ASSERT
3579 	/**
3580 	 * On internal devices, always do the page table consistency check
3581 	 * regardless of page table level or the actual refcnt value.
3582 	 */
3583 	{
3584 #else /* MACH_ASSERT */
3585 	/**
3586 	 * Only perform the page table consistency check when deleting leaf page
3587 	 * tables and it seems like there might be valid/compressed mappings
3588 	 * leftover.
3589 	 */
3590 	if (__improbable(remove_leaf_table && refcnt != 0)) {
3591 #endif /* MACH_ASSERT */
3592 
3593 		/**
3594 		 * There are multiple problems that can manifest as a non-zero refcnt:
3595 		 * 1. A bug in the refcnt management logic.
3596 		 * 2. A memory stomper or hardware failure.
3597 		 * 3. The VM forgetting to unmap all of the valid mappings in an address
3598 		 *    space before destroying a pmap.
3599 		 *
3600 		 * By looping over the page table and determining how many valid or
3601 		 * compressed entries there actually are, we can narrow down which of
3602 		 * these three cases is causing this panic. If the expected refcnt
3603 		 * (valid + compressed) and the actual refcnt don't match then the
3604 		 * problem is probably either a memory corruption issue (if the
3605 		 * non-empty entries don't match valid+compressed, that could also be a
3606 		 * sign of corruption) or refcnt management bug. Otherwise, there
3607 		 * actually are leftover mappings and the higher layers of xnu are
3608 		 * probably at fault.
3609 		 *
3610 		 * Note that we use PAGE_SIZE to govern the range of the table check,
3611 		 * because even for 4K processes we still allocate a 16K page for each
3612 		 * page table; we simply map it using 4 adjacent TTEs for the 4K case.
3613 		 */
3614 		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(PAGE_SIZE - 1)));
3615 
3616 		pt_entry_t *ptep = bpte;
3617 		unsigned short wiredcnt = ptep_get_info((pt_entry_t*)ttetokv(tte))->wiredcnt;
3618 		unsigned short non_empty = 0, valid = 0, comp = 0;
3619 		for (unsigned int i = 0; i < (PAGE_SIZE / sizeof(*ptep)); i++, ptep++) {
3620 			/* Keep track of all non-empty entries to detect memory corruption. */
3621 			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3622 				non_empty++;
3623 			}
3624 
3625 			if (__improbable(pte_is_compressed(*ptep, ptep))) {
3626 				comp++;
3627 			} else if (__improbable(pte_is_valid(*ptep))) {
3628 				valid++;
3629 			}
3630 		}
3631 
3632 #if MACH_ASSERT
3633 		/**
3634 		 * On internal machines, panic whenever a page table getting deleted has
3635 		 * leftover mappings (valid or otherwise) or a leaf page table has a
3636 		 * non-zero refcnt.
3637 		 */
3638 		if (__improbable((non_empty != 0) || (remove_leaf_table && ((refcnt != 0) || (wiredcnt != 0))))) {
3639 #else /* MACH_ASSERT */
3640 		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
3641 		{
3642 #endif /* MACH_ASSERT */
3643 			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3644 			    "%d compressed, %d non-empty, refcnt=%d, wiredcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3645 			    level + 1, valid, comp, non_empty, refcnt, wiredcnt, level, (uint64_t)tte, pmap, bpte);
3646 		}
3647 	}
3648 }
3649 
3650 /**
3651  * Remove a translation table entry pointing to a nested shared region table.
3652  *
3653  * @note The TTE to clear out is expected to point to a leaf table with a refcnt
3654  *       of zero.
3655  *
3656  * @param pmap The user pmap containing the nested page table whose TTE is being removed.
3657  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3658  * @param ttep Pointer to the TTE that should be cleared out.
3659  */
3660 static void
3661 pmap_tte_trim(
3662 	pmap_t pmap,
3663 	vm_offset_t va_start,
3664 	tt_entry_t *ttep)
3665 {
3666 	assert(ttep != NULL);
3667 	const tt_entry_t tte = *ttep;
3668 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3669 
3670 	if (__improbable(tte == ARM_TTE_EMPTY)) {
3671 		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3672 		    "stomper? pmap=%p ttep=%p", __func__, pt_attr_twig_level(pt_attr), pmap, ttep);
3673 	}
3674 
3675 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3676 	sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift);
3677 
3678 	pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr));
3679 }
3680 
3681 /**
3682  * Remove a translation table entry.
3683  *
3684  * @note If the TTE to clear out points to a leaf table, then that leaf table
3685  *       must have a mapping refcount of zero before the TTE can be removed.
3686  * @note If pmap_locked is true, this function expects to be called with
3687  *       the pmap locked exclusive, and will return with the pmap
3688  *       unlocked.
3689  *
3690  * @param pmap The pmap containing the page table whose TTE is being removed.
3691  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3692  * @param ttep Pointer to the TTE that should be cleared out.
3693  * @param level The level of the page table that contains the TTE to be removed.
3694  * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3695  *                    be dropped after removing the table entry.
3696  */
3697 static void
3698 pmap_tte_remove(
3699 	pmap_t pmap,
3700 	vm_offset_t va_start,
3701 	tt_entry_t *ttep,
3702 	unsigned int level,
3703 	bool pmap_locked)
3704 {
3705 	assert(ttep != NULL);
3706 	const tt_entry_t tte = *ttep;
3707 
3708 	if (__improbable(tte == ARM_TTE_EMPTY)) {
3709 		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3710 		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3711 	}
3712 
3713 	sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level);
3714 
3715 	if (pmap_locked) {
3716 		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3717 	}
3718 
3719 	pmap_tte_check_refcounts(pmap, tte, level);
3720 }
3721 
3722 /**
3723  * Given a pointer to an entry within a `level` page table, delete the
3724  * page table at `level` + 1 that is represented by that entry. For instance,
3725  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3726  * contains the PA of the L3 table, and `level` would be "2".
3727  *
3728  * @note If the table getting deallocated is a leaf table, then that leaf table
3729  *       must have a mapping refcount of zero before getting deallocated.
3730  * @note If pmap_locked is true, this function expects to be called with
3731  *       the pmap locked exclusive, and will return with the pmap
3732  *       unlocked.
3733  *
3734  * @param pmap The pmap that owns the page table to be deallocated.
3735  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3736  * @param ttep Pointer to the `level` TTE to remove.
3737  * @param level The level of the table that contains an entry pointing to the
3738  *              table to be removed. The deallocated page table will be a
3739  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3740  *              deleted).
3741  * @param pmap_locked If true, the caller holds an exclusive pmap lock which should
3742  *                    be dropped after removing the table entry.
3743  */
3744 static void
3745 pmap_tte_deallocate(
3746 	pmap_t pmap,
3747 	vm_offset_t va_start,
3748 	tt_entry_t *ttep,
3749 	unsigned int level,
3750 	bool pmap_locked)
3751 {
3752 	tt_entry_t tte = *ttep;
3753 
3754 	if (tte_get_ptd(tte)->pmap != pmap) {
3755 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3756 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3757 	}
3758 
3759 	assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
3760 	    (unsigned long long)tte);
3761 
3762 	/* pmap_tte_remove() will drop the pmap lock if necessary. */
3763 	pmap_tte_remove(pmap, va_start, ttep, level, pmap_locked);
3764 
3765 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3766 }
3767 
3768 /*
3769  *	Remove a range of hardware page-table entries.
3770  *	The range is given as the first (inclusive)
3771  *	and last (exclusive) virtual addresses mapped by
3772  *      the PTE region to be removed.
3773  *
3774  *	The pmap must be locked shared.
3775  *	If the pmap is not the kernel pmap, the range must lie
3776  *	entirely within one pte-page. Assumes that the pte-page exists.
3777  *
3778  *	Returns nothing; the work is delegated to pmap_remove_range_options().
3779  */
3780 MARK_AS_PMAP_TEXT static void
3781 pmap_remove_range(
3782 	pmap_t pmap,
3783 	vm_map_address_t va,
3784 	vm_map_address_t end)
3785 {
3786 	pmap_remove_range_options(pmap, va, end, PMAP_OPTIONS_REMOVE);
3787 }
3788 
3789 MARK_AS_PMAP_TEXT void
3790 pmap_remove_range_options(
3791 	pmap_t pmap,
3792 	vm_map_address_t start,
3793 	vm_map_address_t end,
3794 	int options)
3795 {
3796 	const unsigned int sptm_flags = ((options & PMAP_OPTIONS_REMOVE) ? SPTM_REMOVE_COMPRESSED : 0);
3797 	unsigned int num_removed = 0;
3798 	unsigned int num_external = 0, num_internal = 0, num_reusable = 0;
3799 	unsigned int num_alt_internal = 0;
3800 	unsigned int num_compressed = 0, num_alt_compressed = 0;
3801 	unsigned short num_unwired = 0;
3802 	bool need_strong_sync = false;
3803 
3804 	/*
3805 	 * The pmap lock should be held here.  In most, if not all, cases it will be held shared.
3806 	 */
3807 	pmap_assert_locked(pmap, PMAP_LOCK_HELD);
3808 
3809 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3810 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
3811 	const uint64_t pmap_page_shift = pt_attr_leaf_shift(pt_attr);
3812 	vm_map_address_t va = start;
3813 	pt_entry_t *cpte = pmap_pte(pmap, va);
3814 	assert(cpte != NULL);
3815 
3816 	while (va < end) {
3817 		/**
3818 		 * We may need to sleep when taking the PVH lock below, and our pmap_pv_remove()
3819 		 * call below may also place the lock in sleep mode if processing a large PV list.
3820 		 * We therefore can't leave preemption disabled across that code, which means we
3821 		 * can't directly use the per-CPU prev_ptes array in that code.  Since that code
3822 		 * only cares about the physical address stored in each prev_ptes entry, we'll
3823 		 * use a local array to stash off only the 4-byte physical address index in order
3824 		 * to reduce stack usage.
3825 		 */
3826 		unsigned int pai_list[SPTM_MAPPING_LIMIT];
3827 		_Static_assert(SPTM_MAPPING_LIMIT <= 64,
3828 		    "SPTM_MAPPING_LIMIT value causes excessive stack usage for pai_list");
3829 
3830 		unsigned int num_mappings = (end - va) >> pmap_page_shift;
3831 		if (num_mappings > SPTM_MAPPING_LIMIT) {
3832 			num_mappings = SPTM_MAPPING_LIMIT;
3833 		}
3834 
3835 		/**
3836 		 * Disable preemption to ensure that we can safely access per-CPU mapping data after
3837 		 * issuing the SPTM call.
3838 		 */
3839 		disable_preemption();
3840 		/**
3841 		 * Enter the pmap epoch for the batched unmap operation.  This is necessary because we
3842 		 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
3843 		 * call, so a concurrent pmap_page_protect() operation against one of those pages may
3844 		 * race this call.  That should be perfectly fine as far as the PTE updates are concerned,
3845 		 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
3846 		 * if it does not first drain our epoch.
3847 		 */
3848 		pmap_epoch_enter();
3849 		sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags);
3850 		pmap_epoch_exit();
3851 
3852 		sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes;
3853 		for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3854 			const pt_entry_t prev_pte = prev_ptes[i];
3855 
3856 			if (pte_is_compressed(prev_pte, cpte)) {
3857 				if (options & PMAP_OPTIONS_REMOVE) {
3858 					++num_compressed;
3859 					if (prev_pte & ARM_PTE_COMPRESSED_ALT) {
3860 						++num_alt_compressed;
3861 					}
3862 				}
3863 				pai_list[i] = INVALID_PAI;
3864 				continue;
3865 			} else if (!pte_is_valid(prev_pte)) {
3866 				pai_list[i] = INVALID_PAI;
3867 				continue;
3868 			}
3869 
3870 			if (pte_is_wired(prev_pte)) {
3871 				num_unwired++;
3872 			}
3873 
3874 			const pmap_paddr_t pa = pte_to_pa(prev_pte);
3875 
3876 			if (__improbable(!pa_valid(pa))) {
3877 				pai_list[i] = INVALID_PAI;
3878 				continue;
3879 			}
3880 			pai_list[i] = pa_index(pa);
3881 		}
3882 
3883 		enable_preemption();
3884 		cpte -= num_mappings;
3885 
3886 		for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3887 			if (pai_list[i] == INVALID_PAI) {
3888 				continue;
3889 			}
3890 			locked_pvh_t locked_pvh;
3891 			if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
3892 				locked_pvh = pvh_lock_nopreempt(pai_list[i]);
3893 			} else {
3894 				locked_pvh = pvh_lock(pai_list[i]);
3895 			}
3896 
3897 			bool is_internal, is_altacct;
3898 			pv_remove_return_t remove_status = pmap_remove_pv(pmap, cpte, &locked_pvh, &is_internal, &is_altacct);
3899 
3900 			switch (remove_status) {
3901 			case PV_REMOVE_SUCCESS:
3902 				++num_removed;
3903 				if (is_altacct) {
3904 					assert(is_internal);
3905 					num_internal++;
3906 					num_alt_internal++;
3907 				} else if (is_internal) {
3908 					if (ppattr_test_reusable(pai_list[i])) {
3909 						num_reusable++;
3910 					} else {
3911 						num_internal++;
3912 					}
3913 				} else {
3914 					num_external++;
3915 				}
3916 				break;
3917 			default:
3918 				/*
3919 				 * PVE already removed; this can happen due to a concurrent pmap_disconnect()
3920 				 * executing before we grabbed the PVH lock.
3921 				 */
3922 				break;
3923 			}
3924 
3925 			pvh_unlock(&locked_pvh);
3926 		}
3927 
3928 		va += (num_mappings << pmap_page_shift);
3929 	}
3930 
3931 	if (__improbable(need_strong_sync)) {
3932 		arm64_sync_tlb(true);
3933 	}
3934 
3935 	/*
3936 	 *	Update the counts
3937 	 */
3938 	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
3939 
3940 	if (pmap != kernel_pmap) {
3941 		if (num_unwired != 0) {
3942 			ptd_info_t * const ptd_info = ptep_get_info(cpte - 1);
3943 			if (__improbable(os_atomic_sub_orig(&ptd_info->wiredcnt, num_unwired, relaxed) < num_unwired)) {
3944 				panic("%s: pmap %p VA [0x%llx, 0x%llx) (ptd info %p) wired count underflow", __func__, pmap,
3945 				    (unsigned long long)start, (unsigned long long)end, ptd_info);
3946 			}
3947 		}
3948 
3949 		/* update ledgers */
3950 		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
3951 		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
3952 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
3953 		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
3954 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
3955 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
3956 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
3957 		/* make needed adjustments to phys_footprint */
3958 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
3959 		    ((num_internal -
3960 		    num_alt_internal) +
3961 		    (num_compressed -
3962 		    num_alt_compressed)) * pmap_page_size);
3963 	}
3964 }
3965 
3966 
3967 /*
3968  *	Remove the given range of addresses
3969  *	from the specified map.
3970  *
3971  *	It is assumed that the start and end are properly
3972  *	rounded to the hardware page size.
3973  */
3974 void
3975 pmap_remove(
3976 	pmap_t pmap,
3977 	vm_map_address_t start,
3978 	vm_map_address_t end)
3979 {
3980 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
3981 }
3982 
3983 MARK_AS_PMAP_TEXT vm_map_address_t
3984 pmap_remove_options_internal(
3985 	pmap_t pmap,
3986 	vm_map_address_t start,
3987 	vm_map_address_t end,
3988 	int options)
3989 {
3990 	vm_map_address_t eva = end;
3991 	tt_entry_t     *tte_p;
3992 	bool            unlock = true;
3993 
3994 	if (__improbable(end < start)) {
3995 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
3996 	}
3997 	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3998 		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
3999 	}
4000 
4001 	validate_pmap_mutable(pmap);
4002 
4003 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4004 
4005 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
4006 	pmap_lock(pmap, lock_mode);
4007 
4008 	tte_p = pmap_tte(pmap, start);
4009 
4010 	if ((tte_p == NULL) || ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_FAULT)) {
4011 		goto done;
4012 	}
4013 
4014 	assertf(tte_is_table(*tte_p), "%s: invalid TTE %p (0x%llx) for pmap %p va 0x%llx",
4015 	    __func__, tte_p, (unsigned long long)*tte_p, pmap, (unsigned long long)start);
4016 
4017 	pmap_remove_range_options(pmap, start, end, options);
4018 
4019 	if (pmap->type != PMAP_TYPE_USER) {
4020 		goto done;
4021 	}
4022 
4023 	uint16_t refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
4024 	if (__improbable(refcnt == 0)) {
4025 		ptd_info_t *ptd_info = ptep_get_info((pt_entry_t*)ttetokv(*tte_p));
4026 		os_atomic_inc(&ptd_info->wiredcnt, relaxed); // Prevent someone else from freeing the table if we need to drop the lock
4027 		if (!pmap_lock_shared_to_exclusive(pmap)) {
4028 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4029 		}
4030 		lock_mode = PMAP_LOCK_EXCLUSIVE;
4031 		refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
4032 		if ((os_atomic_dec(&ptd_info->wiredcnt, relaxed) == 0) && (refcnt == 0)) {
4033 			/**
4034 			 * Drain any concurrent retype-sensitive SPTM operations.  This is needed to
4035 			 * ensure that we don't unmap the page table and retype it while those operations
4036 			 * are still finishing on other CPUs, leading to an SPTM violation.  In particular,
4037 			 * the multipage batched cacheability/attribute update code may issue SPTM calls
4038 			 * without holding the relevant PVH or pmap locks, so we can't guarantee those
4039 			 * calls have actually completed despite observing refcnt == 0.
4040 			 *
4041 			 * At this point, we CAN guarantee that:
4042 			 * 1) All prior PTE removals required to produce refcnt == 0 have
4043 			 *    completed and been synchronized for all observers by DSB, and the
4044 			 *    relevant PV list entries removed.  Subsequent calls not already in the
4045 			 *    pmap epoch will no longer observe these mappings.
4046 			 * 2) We now hold the pmap lock exclusive, so there will be no further attempt
4047 			 *    to enter mappings in this page table before it is unmapped.
4048 			 */
4049 			pmap_epoch_prepare_drain();
4050 			pmap_epoch_drain();
4051 			pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr), true);
4052 			unlock = false; // pmap_tte_deallocate() has dropped the lock
4053 		}
4054 	}
4055 done:
4056 	if (unlock) {
4057 		pmap_unlock(pmap, lock_mode);
4058 	}
4059 
4060 	return eva;
4061 }
4062 
4063 void
4064 pmap_remove_options(
4065 	pmap_t pmap,
4066 	vm_map_address_t start,
4067 	vm_map_address_t end,
4068 	int options)
4069 {
4070 	vm_map_address_t va;
4071 
4072 	if (pmap == PMAP_NULL) {
4073 		return;
4074 	}
4075 
4076 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4077 
4078 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4079 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4080 	    VM_KERNEL_ADDRHIDE(end));
4081 
4082 #if MACH_ASSERT
4083 	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4084 		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4085 		    pmap, (uint64_t)start, (uint64_t)end);
4086 	}
4087 	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4088 		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4089 		    pmap, (uint64_t)start, (uint64_t)end);
4090 	}
4091 #endif
4092 
4093 	/*
4094 	 * We allow single-page requests to execute non-preemptibly,
4095 	 * as it doesn't make sense to sample AST_URGENT for a single-page
4096 	 * operation, and there are a couple of special use cases that
4097 	 * require a non-preemptible single-page operation.
4098 	 */
4099 	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4100 		pmap_verify_preemptible();
4101 	}
4102 
4103 	/*
4104 	 *      Invalidate the translation buffer first
4105 	 */
4106 	va = start;
4107 	while (va < end) {
4108 		vm_map_address_t l;
4109 
4110 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4111 		if (l > end) {
4112 			l = end;
4113 		}
4114 
4115 		va = pmap_remove_options_internal(pmap, va, l, options);
4116 	}
4117 
4118 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4119 }
4120 
4121 
4122 /*
4123  *	Remove phys addr if mapped in specified map
4124  */
4125 void
4126 pmap_remove_some_phys(
4127 	__unused pmap_t map,
4128 	__unused ppnum_t pn)
4129 {
4130 	/* Implement to support working set code */
4131 }
4132 
4133 /*
4134  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4135  * switch a thread onto a new vm_map.
4136  */
4137 void
4138 pmap_switch_user(thread_t thread, vm_map_t new_map)
4139 {
4140 	pmap_t new_pmap = new_map->pmap;
4141 
4142 
4143 	thread->map = new_map;
4144 	pmap_set_pmap(new_pmap, thread);
4145 
4146 }
4147 void
4148 pmap_set_pmap(
4149 	pmap_t pmap,
4150 	thread_t thread)
4151 {
4152 	pmap_switch(pmap, thread);
4153 }
4154 
4155 MARK_AS_PMAP_TEXT void
4156 pmap_switch_internal(
4157 	pmap_t pmap,
4158 	thread_t thread)
4159 {
4160 	validate_pmap_mutable(pmap);
4161 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4162 	const uint16_t asid_index = PMAP_HWASID(pmap);
4163 	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4164 		panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4165 	}
4166 
4167 #if __ARM_KERNEL_PROTECT__
4168 	asid_index >>= 1;
4169 #endif
4170 
4171 	if (asid_index > 0) {
4172 		pmap_update_plru(asid_index);
4173 	}
4174 
4175 	__unused sptm_return_t sptm_return;
4176 #if HAS_MTE
4177 	if (ml_thread_get_sec_override(thread)) {
4178 		assert(pmap != kernel_pmap);
4179 		sptm_return = sptm_switch_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_MTE);
4180 #else
4181 #pragma unused(thread)
4182 	if (0) {
4183 #endif
4184 	} else {
4185 		sptm_return = sptm_switch_root(pmap->ttep, 0, 0);
4186 	}
4187 
4188 #if DEVELOPMENT || DEBUG
4189 	if (__improbable(sptm_return & SPTM_SWITCH_ASID_TLBI_FLUSH)) {
4190 		os_atomic_inc(&pmap_asid_flushes, relaxed);
4191 	}
4192 
4193 	if (__improbable(sptm_return & SPTM_SWITCH_RCTX_FLUSH)) {
4194 		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
4195 	}
4196 #endif /* DEVELOPMENT || DEBUG */
4197 }
4198 
4199 void
4200 pmap_switch(
4201 	pmap_t pmap,
4202 	thread_t thread)
4203 {
4204 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
4205 	pmap_switch_internal(pmap, thread);
4206 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4207 }
4208 
4209 void
4210 pmap_page_protect(
4211 	ppnum_t ppnum,
4212 	vm_prot_t prot)
4213 {
4214 	pmap_page_protect_options(ppnum, prot, 0, NULL);
4215 }
4216 
4217 /**
4218  *  Helper function for performing per-mapping accounting following an SPTM disjoint unmap request.
4219  *
4220  * @note [pmap] cannot be the kernel pmap. This is because we do not maintain a ledger in the
4221  *       kernel pmap.
4222  *
4223  * @param pmap The pmap that contained the mapping
4224  * @param pai The physical page index mapped by the mapping
4225  * @param is_compressed Indicates whether the operation was an unmap-to-compress vs. a full unmap
4226  * @param is_internal Indicates whether the mapping was for an internal (aka anonymous) VM page
4227  * @param is_altacct Indicates whether the mapping was subject to alternate accounting.
4228  */
4229 static void
4230 pmap_disjoint_unmap_accounting(pmap_t pmap, unsigned int pai, bool is_compressed, bool is_internal, bool is_altacct)
4231 {
4232 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4233 	pvh_assert_locked(pai);
4234 
4235 	assert(pmap != kernel_pmap);
4236 
4237 	if (is_internal &&
4238 	    !is_altacct &&
4239 	    ppattr_test_reusable(pai)) {
4240 		pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4241 	} else if (!is_internal) {
4242 		pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4243 	}
4244 
4245 	if (is_altacct) {
4246 		assert(is_internal);
4247 		pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4248 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4249 		if (is_compressed) {
4250 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4251 			pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4252 		}
4253 	} else if (ppattr_test_reusable(pai)) {
4254 		assert(is_internal);
4255 		if (is_compressed) {
4256 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4257 			/* was not in footprint, but is now */
4258 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4259 		}
4260 	} else if (is_internal) {
4261 		pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4262 
4263 		/*
4264 		 * Update all stats related to physical footprint, which only
4265 		 * deals with internal pages.
4266 		 */
4267 		if (is_compressed) {
4268 			/*
4269 			 * This removal is only being done so we can send this page to
4270 			 * the compressor; therefore it mustn't affect total task footprint.
4271 			 */
4272 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4273 		} else {
4274 			/*
4275 			 * This internal page isn't going to the compressor, so adjust stats to keep
4276 			 * phys_footprint up to date.
4277 			 */
4278 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4279 		}
4280 	} else {
4281 		/* external page: no impact on ledgers */
4282 	}
4283 }
4284 
4285 /**
4286  * Helper function for issuing a disjoint unmap request to the SPTM and performing
4287  * related accounting.  This function uses the 'prev_ptes' list generated by
4288  * the sptm_unmap_disjoint() call to determine whether said call altered the
4289  * relevant PTEs in a manner that would require accounting updates.
4290  *
4291  * @param pa The physical address against which the disjoint unmap will be issued.
4292  * @param num_mappings The number of disjoint mappings for the SPTM to update.
4293  *                     The per-CPU sptm_ops array should contain the same number
4294  *                     of individual disjoint requests.
4295  */
4296 static void
4297 pmap_disjoint_unmap(pmap_paddr_t pa, unsigned int num_mappings)
4298 {
4299 	const unsigned int pai = pa_index(pa);
4300 
4301 	pvh_assert_locked(pai);
4302 
4303 	assert(num_mappings <= SPTM_MAPPING_LIMIT);
4304 
4305 	assert(get_preemption_level() > 0);
4306 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
4307 
4308 	sptm_unmap_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings);
4309 
4310 	for (unsigned int cur_mapping = 0; cur_mapping < num_mappings; ++cur_mapping) {
4311 		pt_entry_t prev_pte = sptm_pcpu->sptm_prev_ptes[cur_mapping];
4312 
4313 		pt_desc_t * const ptdp = sptm_pcpu->sptm_ptds[cur_mapping];
4314 		const pmap_t pmap = ptdp->pmap;
4315 
4316 		assertf(!pte_is_valid(prev_pte) ||
4317 		    ((pte_to_pa(prev_pte) & ~PAGE_MASK) == pa), "%s: prev_pte 0x%llx does not map pa 0x%llx",
4318 		    __func__, (unsigned long long)prev_pte, (unsigned long long)pa);
4319 
4320 		const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4321 		pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4322 
4323 		if (pmap != kernel_pmap) {
4324 			/*
4325 			 * If the prior PTE is invalid (which may happen due to a concurrent remove operation),
4326 			 * the compressed marker won't be written so we shouldn't account the mapping as compressed.
4327 			 */
4328 			const bool is_compressed = (pte_is_valid(prev_pte) &&
4329 			    ((sptm_pcpu->sptm_ops[cur_mapping].pte_template & ARM_PTE_COMPRESSED_MASK) != 0));
4330 			const bool is_internal = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_INTERNAL) != 0;
4331 			const bool is_altacct = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_ALTACCT) != 0;
4332 
4333 			/*
4334 			 * The rule is that accounting related to PTE contents (wired, PTD refcount)
4335 			 * must be updated by whoever clears the PTE, while accounting related to physical page
4336 			 * attributes must be updated by whoever clears the PVE.  We therefore always call
4337 			 * pmap_disjoint_unmap_accounting() here since we're removing the PVE, but only update
4338 			 * wired/PTD accounting if the prior PTE was valid.
4339 			 */
4340 			pmap_disjoint_unmap_accounting(pmap, pai, is_compressed, is_internal, is_altacct);
4341 
4342 			if (!pte_is_valid(prev_pte)) {
4343 				continue;
4344 			}
4345 
4346 			if (pte_is_wired(prev_pte)) {
4347 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4348 				if (__improbable(os_atomic_dec_orig(&sptm_pcpu->sptm_ptd_info[cur_mapping]->wiredcnt, relaxed) == 0)) {
4349 					panic("%s: over-unwire of ptdp %p, ptd info %p", __func__,
4350 					    ptdp, sptm_pcpu->sptm_ptd_info[cur_mapping]);
4351 				}
4352 			}
4353 		}
4354 	}
4355 }
4356 
4357 /**
4358  * The following two functions, pmap_multipage_op_submit_disjoint() and
4359  * pmap_multipage_op_add_page(), are intended to allow callers to manage batched SPTM
4360  * operations that may span multiple physical pages.  They are intended to operate in
4361  * a way that allows callers such as pmap_page_protect_options_with_flush_range() to
4362  * insert mappings into the per-CPU SPTM disjoint ops array in the same manner that
4363  * they would for an ordinary single-page operation.
4364  * Functions such as pmap_page_protect_options_with_flush_range() operate on a single
4365  * physical page but may be passed a non-NULL flush_range object to indicate that the
4366  * call is part of a larger batched operation which may span multiple physical pages.
4367  * In that scenario, these functions are intended to be used as follows:
4368  * 1) Call pmap_multipage_op_add_page() to insert a "header" for the page into the per-
4369  *    CPU SPTM ops array.  Use the return value from this call as the starting index
4370  *    at which to add ordinary mapping entries into the same array.
4371  * 2) Insert sptm_disjoint_op_t entries into the ops array in the normal manner until
4372  *    the array is full, the SPTM options required for the upcoming sequence of pages
4373  *    need to change, or the current mapping matches flush_range->current_ptep.
4374  *    In the latter case, pmap_insert_flush_range_template() may instead be used
4375  *    to insert the mapping into the per-CPU SPTM region templates array.  See the
4376  *    documentation for pmap_insert_flush_range_template() below.
4377  * 3) If the array is full, call pmap_multipage_op_submit_disjoint() and return to step 1).
4378  * 4) If the SPTM options need to change, call pmap_multipage_op_add_page() to insert
4379  *    a new header with the updated options and, using the return value as the new
4380  *    insertion point for the ops array, resume step 2).
4381  * 5) Upon completion, if there are any pending not-yet-submitted mappings, do not
4382  *    submit those mappings to the SPTM as would ordinarily be done for a single-page
4383  *    call.  These trailing mappings will be submitted as part of the next batch,
4384  *    or by the next-higher caller if the range operation is complete.
4385  *
4386  * Note that, as a performance optimization, the caller may track the insertion
4387  * point in the disjoint ops array locally (i.e. without incrementing
4388  * flush_range->pending_disjoint_entries on every iteration, as long as it takes care to do the
4389  * following:
4390  * 1) Initialize and update that insertion point as described in steps 1) and 4) above.
4391  * 2) Pass the updated insertion point as the 'pending_disjoint_entries' parameter into the calls
4392  *    in steps 3) and 4) above.
4393  * 3) Update flush_range->pending_disjoint_entries with the locally-maintained value along with
4394  *    step 5) above.
4395  */
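
/*
 * Illustrative sketch of the caller sequence described above.  This is not a
 * real caller in this file; "pa", "options", and the op-building step are
 * placeholders, and callers may track the insertion index locally as noted.
 *
 *	unsigned int idx = flush_range->pending_disjoint_entries;
 *
 *	// Step 1: add a paddr header; retry once if the full array had to be
 *	// submitted first.
 *	if (pmap_multipage_op_add_page(pa, &idx, options, flush_range)) {
 *		(void) pmap_multipage_op_add_page(pa, &idx, options, flush_range);
 *	}
 *
 *	// Step 2: insert ordinary mapping entries starting at 'idx'.
 *	for (each mapping of 'pa') {
 *		if (idx == SPTM_MAPPING_LIMIT) {
 *			// Step 3: array full; submit and return to step 1.
 *			pmap_multipage_op_submit_disjoint(idx, flush_range);
 *			idx = 0;
 *			break;
 *		}
 *		sptm_pcpu->sptm_ops[idx++] = op;  // disjoint op for this mapping
 *	}
 *
 *	// Step 5: leave trailing mappings pending for the next batch.
 *	flush_range->pending_disjoint_entries = idx;
 */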
4396 
4397 /**
4398  * Submit any pending disjoint multi-page mapping updates to the SPTM.
4399  *
4400  * @note This function must be called with preemption disabled, and will drop
4401  *       the preemption-disable count upon submitting to the SPTM.
4402  * @note [pending_disjoint_entries] must include *all* pending entries in the SPTM ops array,
4403  *       including physical address "header" entries.
4404  * @note This function automatically updates the per_paddr_header.num_mappings field
4405  *       for the most recent physical address header in the SPTM ops array to its final
4406  *       value.
4407  *
4408  * @param pending_disjoint_entries The number of not-yet-submitted mappings according to the caller.
4409  *                        This value may be greater than [flush_range]->pending_disjoint_entries if
4410  *                        the caller has inserted mappings into the ops array without
4411  *                        updating [flush_range]->pending_disjoint_entries, in which case this
4412  *                        function will update [flush_range]->pending_disjoint_entries with the
4413  *                        caller's value.
4414  * @param flush_range The object tracking the current state of the multipage disjoint
4415  *                    operation.
4416  */
4417 static inline void
4418 pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tlb_flush_range_t *flush_range)
4419 {
4420 	/**
4421 	 * Reconcile the number of pending entries as tracked by the caller with the
4422 	 * number of pending entries tracked by flush_range.  If the caller's value is
4423 	 * greater, we assume the caller has inserted locally-tracked mappings into the
4424 	 * array without directly updating flush_range->pending_disjoint_entries.  Otherwise, we
4425 	 * assume the caller has no locally-tracked mappings and is simply trying to
4426 	 * purge any pending mappings from a prior call sequence.
4427 	 */
4428 	if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4429 		flush_range->pending_disjoint_entries = pending_disjoint_entries;
4430 	} else {
4431 		assert(pending_disjoint_entries == 0);
4432 	}
4433 	if (flush_range->pending_disjoint_entries != 0) {
4434 		assert(get_preemption_level() > 0);
4435 		/**
4436 		 * Compute the correct number of mappings for the most recent paddr
4437 		 * header based on the current position in the SPTM ops array.
4438 		 */
4439 		flush_range->current_header->per_paddr_header.num_mappings =
4440 		    flush_range->pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4441 		const sptm_return_t sptm_return = sptm_update_disjoint_multipage(
4442 			PERCPU_GET(pmap_sptm_percpu)->sptm_ops_pa, flush_range->pending_disjoint_entries);
4443 
4444 		/**
4445 		 * We may be submitting the batch and exiting the epoch partway through
4446 		 * processing the PV list for a page.  That's fine, because in that case we'll
4447 		 * hold the PV lock for that page, which will prevent mappings of that page from
4448 		 * being disconnected and will prevent the completion of pmap_remove() against
4449 		 * any of those mappings, thus also guaranteeing the relevant page table pages
4450 		 * can't be freed.  The epoch still protects mappings for any prior page in
4451 		 * the batch, whose PV locks are no longer held.
4452 		 */
4453 		pmap_epoch_exit();
4454 		enable_preemption();
4455 		if (flush_range->pending_region_entries != 0) {
4456 			flush_range->processed_entries += flush_range->pending_disjoint_entries;
4457 		} else {
4458 			flush_range->processed_entries = 0;
4459 		}
4460 		flush_range->pending_disjoint_entries = 0;
4461 		if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4462 			flush_range->ptfr_flush_needed = true;
4463 		}
4464 	}
4465 }
4466 
4467 /**
4468  * Insert a new physical address "header" entry into the per-CPU SPTM ops array for a
4469  * multi-page SPTM operation.  It is expected that the caller will subsequently add
4470  * mapping entries for this physical address into the array.
4471  *
4472  * @note This function will disable preemption upon creation of the first paddr header
4473  *       (index 0 in the per-CPU SPTM ops array) and it is expected that
4474  *       pmap_multipage_op_submit() will subsequently be called on the same CPU.
4475  * @note Before inserting the new header, this function automatically updates the
4476  *       per_paddr_header.num_mappings field for the previous physical address header
4477  *       (if present) in the SPTM ops array to its final value.
4478  *
4479  * @param phys The physical address for which to insert a header entry.
4480  * @param inout_pending_disjoint_entries
4481  *              [input] The number of not-yet-submitted mappings according to the caller.
4482  *                      This value may be greater than [flush_range]->pending_disjoint_entries if
4483  *                      the caller has inserted mappings into the ops array without
4484  *                      updating [flush_range]->pending_disjoint_entries, in which case this
4485  *                      function will update [flush_range]->pending_disjoint_entries with the
4486  *                      caller's value.
4487  *              [output] Returns the starting index at which the caller should insert mapping
4488  *                       entries into the per-CPU SPTM ops array.
4489  * @param sptm_update_options SPTM_UPDATE_* flags to pass to the SPTM call.
4490  *                            SPTM_UPDATE_SKIP_PAPT is automatically inserted by this
4491  *                            function.
4492  * @param flush_range The object tracking the current state of the multipage operation.
4493  *
4494  * @return True if the pending disjoint operations were submitted to the SPTM due to the ops array already
4495  *         being full, false otherwise.  In the former case, the new header will not be added
4496  *         to the array; the caller will need to re-invoke this function after taking any
4497  *         necessary post-submission action (such as enabling preemption).
4498  */
4499 static inline bool
4500 pmap_multipage_op_add_page(
4501 	pmap_paddr_t phys,
4502 	unsigned int *inout_pending_disjoint_entries,
4503 	uint32_t sptm_update_options,
4504 	pmap_tlb_flush_range_t *flush_range)
4505 {
4506 	unsigned int pending_disjoint_entries = *inout_pending_disjoint_entries;
4507 
4508 	/**
4509 	 * Reconcile the number of pending entries as tracked by the caller with the
4510 	 * number of pending entries tracked by flush_range.  If the caller's value is
4511 	 * greater, we assume the caller has inserted locally-tracked mappings into the
4512 	 * array without directly updating flush_range->pending_disjoint_entries.  Otherwise, we
4513 	 * assume the caller has no locally-tracked mappings and is adding its paddr
4514 	 * header for the first time.
4515 	 */
4516 	if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
4517 		flush_range->pending_disjoint_entries = pending_disjoint_entries;
4518 	} else {
4519 		assert(pending_disjoint_entries == 0);
4520 	}
4521 	if (flush_range->pending_disjoint_entries >= (SPTM_MAPPING_LIMIT - 1)) {
4522 		/**
4523 		 * If the SPTM ops array is either full or only has space for the paddr
4524 		 * header, there won't be room for mapping entries, so submit the pending
4525 		 * mappings to the SPTM now, and return to allow the caller to take
4526 		 * any necessary post-submission action.
4527 		 */
4528 		pmap_multipage_op_submit_disjoint(pending_disjoint_entries, flush_range);
4529 		*inout_pending_disjoint_entries = 0;
4530 		return true;
4531 	}
4532 	pending_disjoint_entries = flush_range->pending_disjoint_entries;
4533 
4534 	sptm_update_options |= SPTM_UPDATE_SKIP_PAPT;
4535 	if (pending_disjoint_entries == 0) {
4536 		disable_preemption();
4537 		/**
4538 		 * Enter the pmap epoch while we gather the disjoint update arguments
4539 		 * and issue the SPTM call.  Since this operation may cover multiple physical
4540 		 * pages, we may construct the argument array and invoke the SPTM without holding
4541 		 * all relevant PVH locks or pmap locks.  We therefore need to record that we are
4542 		 * collecting and modifying mapping state so that e.g. pmap_page_protect() does
4543 		 * not attempt to retype the underlying pages and pmap_remove() does not attempt
4544 		 * to free the page tables used for these mappings without first draining our epoch.
4545 		 */
4546 		pmap_epoch_enter();
4547 		flush_range->pending_disjoint_entries = 1;
4548 	} else {
4549 		/**
4550 		 * Before inserting the new header, update the prior header's number
4551 		 * of paddr-specific mappings to its final value.
4552 		 */
4553 		assert(flush_range->current_header != NULL);
4554 		flush_range->current_header->per_paddr_header.num_mappings =
4555 		    pending_disjoint_entries - flush_range->current_header_first_mapping_index;
4556 	}
4557 	sptm_disjoint_op_t *sptm_ops = PERCPU_GET(pmap_sptm_percpu)->sptm_ops;
4558 	flush_range->current_header = (sptm_update_disjoint_multipage_op_t*)&sptm_ops[pending_disjoint_entries];
4559 	flush_range->current_header_first_mapping_index = ++pending_disjoint_entries;
4560 	flush_range->current_header->per_paddr_header.paddr = phys;
4561 	flush_range->current_header->per_paddr_header.num_mappings = 0;
4562 	flush_range->current_header->per_paddr_header.options = sptm_update_options;
4563 
4564 	*inout_pending_disjoint_entries = pending_disjoint_entries;
4565 	return false;
4566 }
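
/*
 * Illustrative layout (an example, not normative) of the per-CPU SPTM ops array
 * while a multipage disjoint operation is being accumulated.  Each paddr "header"
 * is immediately followed by the mapping entries it describes:
 *
 *	[0] header  { paddr = A, num_mappings = 2, options }
 *	[1] mapping { root_pt_paddr, vaddr, pte_template }   <- mapping of A
 *	[2] mapping { root_pt_paddr, vaddr, pte_template }   <- mapping of A
 *	[3] header  { paddr = B, num_mappings = 1, options }
 *	[4] mapping { root_pt_paddr, vaddr, pte_template }   <- mapping of B
 *
 * With this layout, flush_range->pending_disjoint_entries would be 5,
 * flush_range->current_header would point at entry [3], and
 * flush_range->current_header_first_mapping_index would be 4; the final
 * num_mappings value for the last header is filled in at submission time.
 */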
4567 
4568 /**
4569  * The following two functions, pmap_multipage_op_submit_region() and
4570  * pmap_insert_flush_range_template(), are meant to be used in a similar fashion
4571  * to pmap_multipage_op_submit_disjoint() and pmap_multipage_op_add_page(),
4572  * but for the specific case in which a given mapping within a PV list happens
4573  * to map the current VA within a VA region being operated on by
4574  * phys_attribute_clear_range().  This allows the pmap to further optimize
4575  * the SPTM calls by using sptm_update_region() to modify all mappings within
4576  * the VA region, which requires far fewer table walks than a disjoint operation.
4577  * Since the starting VA of the region, the owning pmap, and the insertion point
4578  * within the per-CPU region templates array are already known, these functions
4579  * don't require the special "header" entry or the complex array position tracking
4580  * of their disjoint equivalents above.
4581  * Note that these functions may be used together with the disjoint functions above;
4582  * these functions can be used for the "primary" mappings corresponding to the VA
4583  * region being manipulated by the VM layer, while the disjoint functions can be
4584  * used for any alias mappings of the underlying pages which fall outside that
4585  * VA region.
4586  */
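
/*
 * Minimal usage sketch (illustrative only; the loop body is an assumption about
 * the caller, not a requirement): a top-level caller such as
 * phys_attribute_clear_range() walks the VA region, lets the per-mapping helpers
 * classify each mapping as either a region template or a disjoint op, and then
 * performs one final submission:
 *
 *	for (va = start; va < end; va += page_size) {
 *		// e.g. pmap_page_protect_options_with_flush_range() or
 *		// arm_force_fast_fault_with_flush_range() for the page mapped at va
 *	}
 *	pmap_multipage_op_submit(flush_range);	// submit anything still pending
 */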
4587 
4588 /**
4589  * Submit any pending region-based templates for the specified flush_range.
4590  *
4591  * @note This function must be called with preemption disabled, and will drop
4592  *       the preemption-disable count upon submitting to the SPTM.
4593  *
4594  * @param flush_range The object tracking the current state of the region operation.
4595  */
4596 static inline void
4597 pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range)
4598 {
4599 	if (flush_range->pending_region_entries != 0) {
4600 		assert(get_preemption_level() > 0);
4601 		pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED);
4602 		/**
4603 		 * If there are any pending disjoint entries, we're already in a pmap epoch.
4604 		 * For disjoint entries, we need to hold the epoch during the entire time we
4605 		 * construct the disjoint ops array because those ops may point to some arbitrary
4606 		 * pmap and we need to ensure the relevant page tables and even the pmap itself
4607 		 * aren't concurrently reclaimed while our ops array points to them.
4608 		 * But for a region op like this, we know we already hold the relevant pmap lock
4609 		 * so none of the above can happen concurrently.  We therefore only need to hold
4610 		 * the epoch across the SPTM call itself to prevent a concurrent unmap operation
4611 		 * from attempting to retype the mapped pages while our SPTM call has them in-
4612 		 * flight.
4613 		 */
4614 		if (flush_range->pending_disjoint_entries == 0) {
4615 			pmap_epoch_enter();
4616 		}
4617 		const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep,
4618 		    flush_range->pending_region_start, flush_range->pending_region_entries,
4619 		    PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa,
4620 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI);
4621 		if (flush_range->pending_disjoint_entries == 0) {
4622 			pmap_epoch_exit();
4623 		}
4624 		enable_preemption();
4625 		if (flush_range->pending_disjoint_entries != 0) {
4626 			flush_range->processed_entries += flush_range->pending_region_entries;
4627 		} else {
4628 			flush_range->processed_entries = 0;
4629 		}
4630 		flush_range->pending_region_start += (flush_range->pending_region_entries <<
4631 		        pmap_get_pt_attr(flush_range->ptfr_pmap)->pta_page_shift);
4632 		flush_range->pending_region_entries = 0;
4633 		if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4634 			flush_range->ptfr_flush_needed = true;
4635 		}
4636 	}
4637 }
4638 
4639 /**
4640  * Insert a PTE template into the per-CPU SPTM region ops array.
4641  * This is meant to be used as a performance optimization for the case in which a given
4642  * mapping being processed by a function such as pmap_page_protect_options_with_flush_range()
4643  * happens to map the current iteration position within [flush_range]'s VA region.
4644  * In this case the mapping can be inserted as a region-based template rather than a disjoint
4645  * operation as would be done in the general case.  The idea is that region-based SPTM
4646  * operations are significantly less expensive than disjoint operations, because each region
4647  * operation only requires a single page table walk at the beginning vs. a table walk for
4648  * each mapping in the disjoint case.  Since the majority of mappings processed by a flush
4649  * range operation belong to the main flush range VA region (i.e. alias mappings outside
4650  * the region are less common), the performance improvement can be significant.
4651  *
4652  * @note This function will disable preemption upon inserting the first entry into the
4653  *       per-CPU templates array, and will re-enable preemption upon submitting the region
4654  *       operation to the SPTM.
4655  *
4656  * @param template The PTE template to insert into the per-CPU templates array.
4657  * @param flush_range The object tracking the current state of the region operation.
4658  *
4659  * @return True if the region operation was submitted to the SPTM, false otherwise.
4660  */
4661 static inline bool
4662 pmap_insert_flush_range_template(pt_entry_t template, pmap_tlb_flush_range_t *flush_range)
4663 {
4664 	if (flush_range->pending_region_entries == 0) {
4665 		disable_preemption();
4666 	}
4667 	flush_range->region_entry_added = true;
4668 	PERCPU_GET(pmap_sptm_percpu)->sptm_templates[flush_range->pending_region_entries++] = template;
4669 	if (flush_range->pending_region_entries == SPTM_MAPPING_LIMIT) {
4670 		pmap_multipage_op_submit_region(flush_range);
4671 		return true;
4672 	}
4673 	return false;
4674 }
4675 
4676 /**
4677  * Wrapper function for submitting any pending operations, region-based or disjoint,
4678  * tracked by a flush range object.  This is meant to be used by the top-level caller that
4679  * iterates over the flush range's VA region and calls functions such as
4680  * pmap_page_protect_options_with_flush_range() or arm_force_fast_fault_with_flush_range()
4681  * to construct the relevant SPTM operations arrays.
4682  *
4683  * @param flush_range The object tracking the current state of region and/or disjoint operations.
4684  */
4685 static inline void
4686 pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range)
4687 {
4688 	pmap_multipage_op_submit_disjoint(0, flush_range);
4689 	pmap_multipage_op_submit_region(flush_range);
4690 }
4691 
4692 /**
4693  * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range()
4694  * is removing/updating all mappings in preparation for a retype operation.  In this case
4695  * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held
4696  * by the caller, and will perform the necessary pmap epoch drain and retype the page back to XNU_DEFAULT
4697  * prior to returning.
4698  */
4699 #define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000
4700 _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK,
4701     "PMAP_OPTIONS_PPO_PENDING_RETYPE outside reserved encoding space");
4702 
4703 /**
4704  * Lower the permission for all mappings to a given page. If VM_PROT_NONE is specified,
4705  * the mappings will be removed.
4706  *
4707  * @param ppnum Page number to lower the permission of.
4708  * @param prot The permission to lower to.
4709  * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
4710  *                PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is
4711  *                already locked and a pmap epoch drain should be performed, along with
4712  *                retyping [ppnum] back to XNU_DEFAULT.
4713  *                PMAP_OPTIONS_COMPRESSOR indicates the function is called by the
4714  *                VM compressor.
4715  *                PMAP_OPTIONS_RETYPE requests that [ppnum] be retyped back to XNU_DEFAULT,
4716  *                along with an epoch drain; like PMAP_OPTIONS_PPO_PENDING_RETYPE but without
4717  *                the PVH lock being held by the caller.
4718  * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
4719  *                   by the caller.  This is an input/output parameter which may be updated
4720  *                   to reflect a new PV head value to be passed to a later call to pvh_unlock().
4721  * @param flush_range When present, this function will skip the TLB flush for the
4722  *                    mappings that are covered by the range, leaving that to be
4723  *                    done later by the caller.  It may also avoid submitting mapping
4724  *                    updates directly to the SPTM, instead accumulating them in a
4725  *                    per-CPU array to be submitted later by the caller.
4726  *
4727  * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4728  */
4729 MARK_AS_PMAP_TEXT static void
4730 pmap_page_protect_options_with_flush_range(
4731 	ppnum_t ppnum,
4732 	vm_prot_t prot,
4733 	unsigned int options,
4734 	locked_pvh_t *locked_pvh,
4735 	pmap_tlb_flush_range_t *flush_range)
4736 {
4737 	pmap_paddr_t phys = ptoa(ppnum);
4738 	locked_pvh_t local_locked_pvh = {.pvh = 0};
4739 	pv_entry_t *pve_p = NULL;
4740 	pv_entry_t *pveh_p = NULL;
4741 	pv_entry_t *pvet_p = NULL;
4742 	pt_entry_t *pte_p = NULL;
4743 	pv_entry_t *new_pve_p = NULL;
4744 	pt_entry_t *new_pte_p = NULL;
4745 
4746 	bool remove = false;
4747 	unsigned int pvh_cnt = 0;
4748 	unsigned int num_mappings = 0, num_skipped_mappings = 0;
4749 
4750 	assert(ppnum != vm_page_fictitious_addr);
4751 
4752 	/**
4753 	 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4754 	 *
4755 	 * PMAP_OPTIONS_NOFLUSH indicates there is no need to flush the TLB at any point in the operation, and
4756 	 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
4757 	 * semantics conflict with each other, so assert they are not both specified.
4758 	 */
4759 	assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
4760 
4761 	/* Only work with managed pages. */
4762 	if (!pa_valid(phys)) {
4763 		return;
4764 	}
4765 
4766 	/*
4767 	 * Determine the new protection.
4768 	 */
4769 	switch (prot) {
4770 	case VM_PROT_ALL:
4771 		return;         /* nothing to do */
4772 	case VM_PROT_READ:
4773 	case VM_PROT_READ | VM_PROT_EXECUTE:
4774 		break;
4775 	default:
4776 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4777 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4778 		remove = true;
4779 		break;
4780 	}
4781 
4782 	/**
4783 	 * We don't support cross-page batching (indicated by flush_range being non-NULL) for removals,
4784 	 * as removals must use the SPTM prev_ptes array for accounting, which isn't supported for cross-
4785 	 * page batches.
4786 	 */
4787 	assert((flush_range == NULL) || !remove);
4788 
4789 	unsigned int pai = pa_index(phys);
4790 	if (__probable(locked_pvh == NULL)) {
4791 		if (flush_range != NULL) {
4792 			/**
4793 			 * If we're partway through processing a multi-page batched call,
4794 			 * preemption will already be disabled so we can't simply call
4795 			 * pvh_lock() which may block.  Instead, we first try to acquire
4796 			 * the lock without waiting, which in most cases should succeed.
4797 			 * If it fails, we submit the pending batched operations to re-
4798 			 * enable preemption and then acquire the lock normally.
4799 			 */
4800 			local_locked_pvh = pvh_try_lock(pai);
4801 			if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
4802 				pmap_multipage_op_submit(flush_range);
4803 				local_locked_pvh = pvh_lock(pai);
4804 			}
4805 		} else {
4806 			local_locked_pvh = pvh_lock(pai);
4807 		}
4808 	} else {
4809 		local_locked_pvh = *locked_pvh;
4810 		assert(pai == local_locked_pvh.pai);
4811 	}
4812 	assert(local_locked_pvh.pvh != 0);
4813 	pvh_assert_locked(pai);
4814 
4815 	bool pvh_lock_sleep_mode_needed = false;
4816 	bool clear_epoch = false;
4817 
4818 	/*
4819 	 * PVH should be locked before accessing per-CPU data, as we're relying on the lock
4820 	 * to disable preemption.
4821 	 */
4822 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4823 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4824 	sptm_disjoint_op_t *sptm_ops = NULL;
4825 	pt_desc_t **sptm_ptds = NULL;
4826 	ptd_info_t **sptm_ptd_info = NULL;
4827 
4828 	/* BEGIN IGNORE CODESTYLE */
4829 
4830 	/**
4831 	 * This would also work as a block, with the above variables declared using the
4832 	 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
4833 	 * dereferencing __block variables through stack forwarding pointers) isn't needed
4834 	 * here, as we never need to use this code sequence as a closure.
4835 	 */
4836 	#define PPO_PERCPU_INIT() do { \
4837 	        disable_preemption(); \
4838 	        pmap_cpu_data = pmap_get_cpu_data(); \
4839 	        sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
4840 	        sptm_ops = sptm_pcpu->sptm_ops; \
4841 	        sptm_ptds = sptm_pcpu->sptm_ptds; \
4842 	        sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \
4843 	        if (remove) { \
4844 	                clear_epoch = true; \
4845 	                pmap_epoch_enter(); \
4846 	        } \
4847 	} while (0)
4848 
4849 	/* END IGNORE CODESTYLE */
4850 
4851 
4852 	PPO_PERCPU_INIT();
4853 
4854 	pv_entry_t **pve_pp = NULL;
4855 
4856 	if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
4857 		pte_p = pvh_ptep(local_locked_pvh.pvh);
4858 	} else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4859 		pve_p = pvh_pve_list(local_locked_pvh.pvh);
4860 		pveh_p = pve_p;
4861 	} else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
4862 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
4863 	}
4864 
4865 	int pve_ptep_idx = 0;
4866 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4867 
4868 	/*
4869 	 * We need to keep track of whether a particular PVE list contains IOMMU
4870 	 * mappings when removing entries, because we should only remove CPU
4871 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4872 	 * it around.
4873 	 */
4874 	bool iommu_mapping_in_pve = false;
4875 
4876 	/**
4877 	 * With regard to TLBI, there are three cases:
4878 	 *
4879 	 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
4880 	 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
4881 	 *    itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
4882 	 *    mapping is out of the range.
4883 	 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
4884 	 *    let SPTM handle TLBI flushing.
4885 	 */
4886 	const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
4887 	const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
4888 
4889 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4890 		if (__improbable(pvh_lock_sleep_mode_needed)) {
4891 			assert((num_mappings == 0) && (num_skipped_mappings == 0));
4892 			if (clear_epoch) {
4893 				pmap_epoch_exit();
4894 				clear_epoch = false;
4895 			}
4896 			/**
4897 			 * Undo the explicit preemption disable done in the last call to PPO_PERCPU_INIT().
4898 			 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
4899 			 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
4900 			 * core while processing SPTM per-CPU data.  At the same time, we also want preemption
4901 			 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
4902 			 * urgent ASTs can be handled.
4903 			 */
4904 			enable_preemption();
4905 			pvh_lock_enter_sleep_mode(&local_locked_pvh);
4906 			pvh_lock_sleep_mode_needed = false;
4907 			PPO_PERCPU_INIT();
4908 		}
4909 
4910 		if (pve_p != PV_ENTRY_NULL) {
4911 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4912 			if (pte_p == PT_ENTRY_NULL) {
4913 				goto protect_skip_pve;
4914 			}
4915 		}
4916 
4917 #ifdef PVH_FLAG_IOMMU
4918 		if (pvh_ptep_is_iommu(pte_p)) {
4919 			iommu_mapping_in_pve = true;
4920 			if (__improbable(remove && (options & PMAP_OPTIONS_COMPRESSOR))) {
4921 				const iommu_instance_t iommu = ptep_get_iommu(pte_p);
4922 				panic("%s: attempt to compress ppnum 0x%x owned by iommu driver "
4923 				    "%u (token: %#x), pve_p=%p", __func__, ppnum, GET_IOMMU_ID(iommu),
4924 				    GET_IOMMU_TOKEN(iommu), pve_p);
4925 			}
4926 			if (remove && (pve_p == PV_ENTRY_NULL)) {
4927 				/*
4928 				 * We've found an IOMMU entry and it's the only entry in the PV list.
4929 				 * We don't discard IOMMU entries, so simply set up the new PV list to
4930 				 * contain the single IOMMU PTE and exit the loop.
4931 				 */
4932 				new_pte_p = pte_p;
4933 				break;
4934 			}
4935 			++num_skipped_mappings;
4936 			goto protect_skip_pve;
4937 		}
4938 #endif
4939 
4940 		const pt_entry_t spte = os_atomic_load(pte_p, relaxed);
4941 
4942 		if (__improbable(!remove && !pte_is_valid(spte))) {
4943 			++num_skipped_mappings;
4944 			goto protect_skip_pve;
4945 		}
4946 
4947 		pt_desc_t *ptdp = NULL;
4948 		pmap_t pmap = NULL;
4949 		vm_map_address_t va = 0;
4950 
4951 		if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
4952 			/**
4953 			 * If the current mapping matches the flush range's current iteration position,
4954 			 * there's no need to do the work of getting the PTD.  We already know the pmap,
4955 			 * and the VA is implied by flush_range->pending_region_start.
4956 			 */
4957 			pmap = flush_range->ptfr_pmap;
4958 		} else {
4959 			ptdp = ptep_get_ptd(pte_p);
4960 			pmap = ptdp->pmap;
4961 			va = ptd_get_va(ptdp, pte_p);
4962 		}
4963 
4964 		/**
4965 		 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
4966 		 * pending disjoint ops, so we don't need to do flush range disjoint op management.
4967 		 */
4968 		if ((flush_range != NULL) && (ptdp != NULL)) {
4969 			/**
4970 			 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
4971 			 * We do this in three cases:
4972 			 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
4973 			 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
4974 			 *    for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
4975 			 * 3) We need to change the options passed to the SPTM for a run of one or more mappings.  Specifically,
4976 			 *    if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
4977 			 *    belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
4978 			 *    the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
4979 			 */
4980 			uint32_t per_mapping_sptm_update_options = sptm_update_options;
4981 			if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4982 				per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
4983 			}
4984 			if ((num_mappings == 0) ||
4985 			    (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
4986 				if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
4987 					/**
4988 					 * If we needed to submit the pending disjoint ops to make room for the new page,
4989 					 * flush any pending region ops to reenable preemption and restart the loop with
4990 					 * the lock in sleep mode.  This prevents preemption from being held disabled
4991 					 * for an arbitrary amount of time in the pathological case in which we have
4992 					 * both pending region ops and an excessively long PV list that repeatedly
4993 					 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
4994 					 */
4995 					pmap_multipage_op_submit_region(flush_range);
4996 					assert(num_mappings == 0);
4997 					num_skipped_mappings = 0;
4998 					pvh_lock_sleep_mode_needed = true;
4999 					continue;
5000 				}
5001 			}
5002 		}
5003 
5004 		if (__improbable((pmap == NULL) ||
5005 		    (pte_is_valid(spte) && (atop(pte_to_pa(spte)) != ppnum)))) {
5006 #if MACH_ASSERT
5007 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
5008 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
5009 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
5010 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5011 
5012 				pv_entry_t *check_pvep = pve_p;
5013 
5014 				do {
5015 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
5016 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
5017 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, (void*)local_locked_pvh.pvh, pve_p, pai);
5018 					}
5019 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
5020 
5021 				/* Restore previous PTEP value. */
5022 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
5023 			}
5024 #endif
5025 			panic("%s: bad PVE pte_p=%p pmap=%p prot=%d options=%u, pvh=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
5026 			    __func__, pte_p, pmap, prot, options, (void*)local_locked_pvh.pvh, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
5027 		}
5028 
5029 		pt_entry_t pte_template = ARM_PTE_EMPTY;
5030 
5031 		if (ptdp != NULL) {
5032 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
5033 			sptm_ops[num_mappings].vaddr = va;
5034 		}
5035 
5036 		/* Remove the mapping if new protection is NONE */
5037 		if (remove) {
5038 			sptm_ptds[num_mappings] = ptdp;
5039 			sptm_ptd_info[num_mappings] = ptd_get_info(ptdp);
5040 			sptm_pcpu->sptm_acct_flags[num_mappings] = 0;
5041 			if (pmap != kernel_pmap) {
5042 				const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
5043 				const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
5044 
5045 				if (is_internal) {
5046 					sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_INTERNAL;
5047 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
5048 				}
5049 				if (is_altacct) {
5050 					sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_ALTACCT;
5051 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
5052 				}
5053 				if (compress && is_internal) {
5054 					pte_template = ARM_PTE_COMPRESSED;
5055 					if (is_altacct) {
5056 						pte_template |= ARM_PTE_COMPRESSED_ALT;
5057 					}
5058 				}
5059 			}
5060 			/* Remove this CPU mapping from PVE list. */
5061 			if (pve_p != PV_ENTRY_NULL) {
5062 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5063 			}
5064 		} else {
5065 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5066 
5067 			if (pmap == kernel_pmap) {
5068 				pte_template = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
5069 			} else {
5070 				pte_template = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
5071 			}
5072 
5073 			/*
5074 			 * We must at least clear the 'was writeable' flag, since we are revoking write access at a minimum,
5075 			 * meaning that the VM is effectively requesting that subsequent write accesses to these mappings
5076 			 * go through vm_fault() instead of being handled by arm_fast_fault().
5077 			 */
5078 			pte_set_was_writeable(pte_template, false);
5079 
5080 			/*
5081 			 * While the naive implementation of this would serve to add execute
5082 			 * permission, this is not how the VM uses this interface, or how
5083 			 * x86_64 implements it.  So ignore requests to add execute permissions.
5084 			 */
5085 #if DEVELOPMENT || DEBUG
5086 			if ((!(prot & VM_PROT_EXECUTE) && nx_enabled && pmap->nx_enabled) ||
5087 			    (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
5088 #else
5089 			if (!(prot & VM_PROT_EXECUTE) ||
5090 			    (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
5091 #endif
5092 			{
5093 				pte_template |= pt_attr_leaf_xn(pt_attr);
5094 			}
5095 		}
5096 
5097 		if (ptdp != NULL) {
5098 			sptm_ops[num_mappings].pte_template = pte_template;
5099 			++num_mappings;
5100 		} else if (pmap_insert_flush_range_template(pte_template, flush_range)) {
5101 			/**
5102 			 * We submit both the pending disjoint and pending region ops whenever
5103 			 * either category reaches the mapping limit.  Having pending operations
5104 			 * in either category will keep preemption disabled, and we want to ensure
5105 			 * that we can at least temporarily re-enable preemption roughly every
5106 			 * SPTM_MAPPING_LIMIT mappings.
5107 			 */
5108 			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
5109 			pvh_lock_sleep_mode_needed = true;
5110 			num_mappings = num_skipped_mappings = 0;
5111 		}
5112 
5113 protect_skip_pve:
5114 		if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
5115 			if (flush_range != NULL) {
5116 				/* See comment above for why we submit both disjoint and region ops when we hit the limit. */
5117 				pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
5118 				pmap_multipage_op_submit_region(flush_range);
5119 			} else if (num_mappings > 0) {
5120 				if (remove) {
5121 					pmap_disjoint_unmap(phys, num_mappings);
5122 				} else {
5123 					sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
5124 				}
5125 			}
5126 			pvh_lock_sleep_mode_needed = true;
5127 			num_mappings = num_skipped_mappings = 0;
5128 		}
5129 		pte_p = PT_ENTRY_NULL;
5130 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5131 			pve_ptep_idx = 0;
5132 
5133 			if (remove) {
5134 				/**
5135 				 * If there are any IOMMU mappings in the PVE list, preserve
5136 				 * those mappings in a new PVE list (new_pve_p) which will later
5137 				 * become the new PVH entry. Keep track of the CPU mappings in
5138 				 * pveh_p/pvet_p so they can be deallocated later.
5139 				 */
5140 				if (iommu_mapping_in_pve) {
5141 					iommu_mapping_in_pve = false;
5142 					pv_entry_t *temp_pve_p = pve_next(pve_p);
5143 					pve_remove(&local_locked_pvh, pve_pp, pve_p);
5144 					if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
5145 						pveh_p = pvh_pve_list(local_locked_pvh.pvh);
5146 					} else {
5147 						assert(pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL));
5148 						pveh_p = PV_ENTRY_NULL;
5149 					}
5150 					pve_p->pve_next = new_pve_p;
5151 					new_pve_p = pve_p;
5152 					pve_p = temp_pve_p;
5153 					continue;
5154 				} else {
5155 					pvet_p = pve_p;
5156 					pvh_cnt++;
5157 				}
5158 			}
5159 
5160 			pve_pp = pve_next_ptr(pve_p);
5161 			pve_p = pve_next(pve_p);
5162 			iommu_mapping_in_pve = false;
5163 		}
5164 	}
5165 
5166 	if (num_mappings != 0) {
5167 		if (remove) {
5168 			pmap_disjoint_unmap(phys, num_mappings);
5169 		} else if (flush_range == NULL) {
5170 			sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
5171 		} else {
5172 			/* Resync the pending mapping state in flush_range with our local state. */
5173 			assert(num_mappings >= flush_range->pending_disjoint_entries);
5174 			flush_range->pending_disjoint_entries = num_mappings;
5175 		}
5176 	}
5177 
5178 	if (clear_epoch) {
5179 		pmap_epoch_exit();
5180 	}
5181 
5182 	/**
5183 	 * Undo the explicit disable_preemption() done in PPO_PERCPU_INIT().
5184 	 * Note that enable_preemption() decrements a per-thread counter, so if
5185 	 * we happen to still hold the PVH lock in spin mode then preemption won't
5186 	 * actually be re-enabled until we drop the lock (which also decrements
5187 	 * the per-thread counter).
5188 	 */
5189 	enable_preemption();
5190 
5191 	/* if we removed a bunch of entries, take care of them now */
5192 	if (remove) {
5193 		/**
5194 		 * If a retype is going to be needed here and/or by our caller, drain
5195 		 * the epochs to ensure that concurrent calls to batched operations such as
5196 		 * pmap_remove() and the various multipage attribute update functions have
5197 		 * finished consuming mappings of this page.
5198 		 */
5199 		bool retype_needed = false;
5200 		sptm_frame_type_t frame_type = XNU_DEFAULT;
5201 		if (options & (PMAP_OPTIONS_PPO_PENDING_RETYPE | PMAP_OPTIONS_RETYPE)) {
5202 			/**
5203 			 * If the frame type isn't currently XNU_DEFAULT, retype it back either
5204 			 * to satisfy the caller's request (PMAP_OPTIONS_RETYPE) or to ensure
5205 			 * the caller's subsequent retype will work as not all non-default types
5206 			 * can be directly retyped to one another without going through XNU_DEFAULT.
5207 			 */
5208 			frame_type = sptm_get_frame_type(phys);
5209 			retype_needed = (frame_type != XNU_DEFAULT);
5210 		}
5211 		/**
5212 		 * If the caller is indicating that it will subsequently retype the page
5213 		 * by passing PMAP_OPTIONS_PPO_PENDING_RETYPE, then we'll need to drain the epochs
5214 		 * regardless of current frame type to prepare for the caller's retype.
5215 		 */
5216 		const bool drain_needed = retype_needed || !!(options & PMAP_OPTIONS_PPO_PENDING_RETYPE);
5217 		if (__improbable(drain_needed)) {
5218 			pmap_epoch_prepare_drain();
5219 		}
5220 		if (new_pve_p != PV_ENTRY_NULL) {
5221 			pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP);
5222 		} else if (new_pte_p != PT_ENTRY_NULL) {
5223 			pvh_update_head(&local_locked_pvh, new_pte_p, PVH_TYPE_PTEP);
5224 		} else {
5225 			pvh_set_flags(&local_locked_pvh, 0);
5226 			pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
5227 		}
5228 
5229 		if (__improbable(drain_needed)) {
5230 			pmap_epoch_drain();
5231 		}
5232 		if (__improbable(retype_needed)) {
5233 			const sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5234 			sptm_retype(phys, frame_type, XNU_DEFAULT, retype_params);
5235 		}
5236 	}
5237 
5238 	if (__probable(locked_pvh == NULL)) {
5239 		pvh_unlock(&local_locked_pvh);
5240 	} else {
5241 		*locked_pvh = local_locked_pvh;
5242 	}
5243 
5244 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
5245 		assert(pveh_p != PV_ENTRY_NULL);
5246 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
5247 	}
5248 
5249 	if ((flush_range != NULL) && !preemption_enabled()) {
5250 		flush_range->processed_entries += num_skipped_mappings;
5251 	}
5252 }
5253 
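/**
 * Internal implementation of pmap_page_protect_options().
 *
 * @note [arg] is a legacy hint from the VM layer; when it is non-NULL,
 *       PMAP_OPTIONS_NOFLUSH is cleared so that TLB invalidation is handled
 *       here rather than by a later pmap_flush() call (see comment below).
 */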
5254 MARK_AS_PMAP_TEXT void
5255 pmap_page_protect_options_internal(
5256 	ppnum_t ppnum,
5257 	vm_prot_t prot,
5258 	unsigned int options,
5259 	void *arg)
5260 {
5261 	if (arg != NULL) {
5262 		/*
5263 		 * This is a legacy argument from the pre-ARM era that the VM layer passes in to hint that it will call
5264 		 * pmap_flush() later to flush the TLB. On ARM platforms, however, pmap_flush() is not implemented,
5265 		 * as it's typically more efficient to perform the TLB flushing inline with the page table updates
5266 		 * themselves. Therefore, if the argument is non-NULL, pmap will take care of TLB flushing itself
5267 		 * by clearing PMAP_OPTIONS_NOFLUSH.
5268 		 */
5269 		options &= ~PMAP_OPTIONS_NOFLUSH;
5270 	}
5271 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL, NULL);
5272 }
5273 
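/**
 * Lower the permissions of (or remove, for VM_PROT_NONE) all mappings of [ppnum].
 * This exported wrapper performs the cheap early-out checks and brackets the
 * real work in pmap_page_protect_options_internal() with PMAP_TRACE events.
 */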
5274 void
5275 pmap_page_protect_options(
5276 	ppnum_t ppnum,
5277 	vm_prot_t prot,
5278 	unsigned int options,
5279 	void *arg)
5280 {
5281 	pmap_paddr_t    phys = ptoa(ppnum);
5282 
5283 	assert(ppnum != vm_page_fictitious_addr);
5284 
5285 	/* Only work with managed pages. */
5286 	if (!pa_valid(phys)) {
5287 		return;
5288 	}
5289 
5290 	/*
5291 	 * Determine the new protection.
5292 	 */
5293 	if (prot == VM_PROT_ALL) {
5294 		return;         /* nothing to do */
5295 	}
5296 
5297 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5298 
5299 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
5300 
5301 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5302 }
5303 
5304 
5305 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
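/**
 * Disable user JOP for [pmap]: reconfigure its root translation table via the
 * SPTM (SPTM_ROOT_PT_FLAG_JOP) and record the change in pmap->disable_jop.
 */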
5306 MARK_AS_PMAP_TEXT void
5307 pmap_disable_user_jop_internal(pmap_t pmap)
5308 {
5309 	if (pmap == kernel_pmap) {
5310 		panic("%s: called with kernel_pmap", __func__);
5311 	}
5312 	validate_pmap_mutable(pmap);
5313 	sptm_configure_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_JOP);
5314 	pmap->disable_jop = true;
5315 }
5316 
5317 void
5318 pmap_disable_user_jop(pmap_t pmap)
5319 {
5320 	pmap_disable_user_jop_internal(pmap);
5321 }
5322 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5323 
5324 /*
5325  * Indicates if the pmap layer enforces some additional restrictions on the
5326  * given set of protections.
5327  */
5328 bool
5329 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5330 {
5331 	return false;
5332 }
5333 
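/**
 * Indicates whether [pmap] is permitted to contain execute-only (XO) mappings;
 * always true in this configuration.
 */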
5334 static inline bool
5335 pmap_allows_xo(pmap_t pmap __unused)
5336 {
5337 	return true;
5338 }
5339 
5340 /*
5341  *	Set the physical protection on the
5342  *	specified range of this map as requested.
5343  *	VERY IMPORTANT: Will not increase permissions.
5344  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5345  */
5346 void
5347 pmap_protect(
5348 	pmap_t pmap,
5349 	vm_map_address_t b,
5350 	vm_map_address_t e,
5351 	vm_prot_t prot)
5352 {
5353 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
5354 }
5355 
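/**
 * Indicates whether a batched protection update of [num_mappings] mappings
 * requires a synchronous ("strong") TLB flush once the updates are done;
 * never needed in this configuration.
 */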
5356 static bool
5357 pmap_protect_strong_sync(unsigned int num_mappings __unused)
5358 {
5359 	return false;
5360 }
5361 
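/**
 * Internal helper for pmap_protect_options().  Applies the requested protection
 * change to the portion of [start, end) that lies within a single twig-level
 * table, batching PTE templates into SPTM_MAPPING_LIMIT-sized
 * sptm_update_region() calls.
 *
 * @return The VA at which processing stopped; the caller resumes from there.
 */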
5362 MARK_AS_PMAP_TEXT vm_map_address_t
5363 pmap_protect_options_internal(
5364 	pmap_t pmap,
5365 	vm_map_address_t start,
5366 	vm_map_address_t end,
5367 	vm_prot_t prot,
5368 	unsigned int options,
5369 	__unused void *args)
5370 {
5371 	pt_entry_t       *pte_p;
5372 	bool             set_NX = true;
5373 	bool             set_XO = false;
5374 	bool             should_have_removed = false;
5375 	bool             need_strong_sync = false;
5376 
5377 	/* Validate the pmap input before accessing its data. */
5378 	validate_pmap_mutable(pmap);
5379 
5380 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
5381 
5382 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
5383 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
5384 	}
5385 
5386 #if DEVELOPMENT || DEBUG
5387 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5388 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5389 			should_have_removed = true;
5390 		}
5391 	} else
5392 #endif
5393 	{
5394 		/* Determine the new protection. */
5395 		switch (prot) {
5396 		case VM_PROT_READ:
5397 		case VM_PROT_READ | VM_PROT_EXECUTE:
5398 			break;
5399 		case VM_PROT_READ | VM_PROT_WRITE:
5400 		case VM_PROT_ALL:
5401 			return end;         /* nothing to do */
5402 		case VM_PROT_EXECUTE:
5403 			set_XO = true;
5404 			if (pmap_allows_xo(pmap)) {
5405 				break;
5406 			}
5407 		/* Fall through and panic if this pmap shouldn't be allowed to have XO mappings. */
5408 		default:
5409 			should_have_removed = true;
5410 		}
5411 	}
5412 
5413 	if (__improbable(should_have_removed)) {
5414 		panic("%s: should have been a remove operation, "
5415 		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
5416 		    __FUNCTION__,
5417 		    pmap, (void *)start, (void *)end, prot, options, args);
5418 	}
5419 
5420 #if DEVELOPMENT || DEBUG
5421 	bool force_write = false;
5422 	if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
5423 		force_write = true;
5424 	}
5425 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5426 #else
5427 	if ((prot & VM_PROT_EXECUTE))
5428 #endif
5429 	{
5430 		set_NX = false;
5431 	} else {
5432 		set_NX = true;
5433 	}
5434 
5435 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
5436 	vm_map_address_t va = start;
5437 	vm_map_address_t sptm_start_va = start;
5438 	unsigned int num_mappings = 0;
5439 
5440 	pmap_lock(pmap, PMAP_LOCK_SHARED);
5441 
5442 	pte_p = pmap_pte(pmap, start);
5443 
5444 	if (pte_p == NULL) {
5445 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
5446 		return end;
5447 	}
5448 
5449 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
5450 #if DEVELOPMENT || DEBUG
5451 	if (!force_write)
5452 #endif
5453 	{
5454 		disable_preemption();
5455 		sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5456 	}
5457 
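	/*
	 * Build the base PTE template encoding the new access permissions.  The
	 * per-PTE loop below adds execute-permission bits as needed and, in the
	 * force_write case, merges these permission bits into each existing PTE
	 * before remapping it.
	 */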
5458 	pt_entry_t tmplate = ARM_PTE_EMPTY;
5459 
5460 	if (pmap == kernel_pmap) {
5461 #if DEVELOPMENT || DEBUG
5462 		if (force_write) {
5463 			tmplate = ARM_PTE_AP(AP_RWNA);
5464 		} else
5465 #endif
5466 		{
5467 			tmplate = ARM_PTE_AP(AP_RONA);
5468 		}
5469 	} else {
5470 #if DEVELOPMENT || DEBUG
5471 		if (force_write) {
5472 			assert(pmap->type != PMAP_TYPE_NESTED);
5473 			tmplate = pt_attr_leaf_rw(pt_attr);
5474 		} else
5475 #endif
5476 		if (__improbable(set_XO)) {
5477 			tmplate = pt_attr_leaf_rona(pt_attr);
5478 		} else {
5479 			tmplate = pt_attr_leaf_ro(pt_attr);
5480 		}
5481 	}
5482 
5483 	if (set_NX) {
5484 		tmplate |= pt_attr_leaf_xn(pt_attr);
5485 	}
5486 
5487 	while (va < end) {
5488 		pt_entry_t spte = ARM_PTE_EMPTY;
5489 
5490 		/**
5491 		 * Removing "NX" would grant "execute" access immediately, bypassing any
5492 		 * checks VM might want to do in its soft fault path.
5493 		 * pmap_protect() and co. are not allowed to increase access permissions,
5494 		 * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case.
5495 		 * Therefore, if we are not explicitly clearing execute permissions, inherit
5496 		 * the existing permissions.
5497 		 */
5498 		if (!set_NX) {
5499 			spte = os_atomic_load(pte_p, relaxed);
5500 			if (__improbable(!pte_is_valid(spte))) {
5501 				tmplate |= pt_attr_leaf_xn(pt_attr);
5502 			} else {
5503 				tmplate |= (spte & ARM_PTE_XMASK);
5504 			}
5505 		}
5506 
5507 #if DEVELOPMENT || DEBUG
5508 		/*
5509 		 * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to
5510 		 * provide a "backdoor" to allow normally write-protected compressor pages to be
5511 		 * temporarily written without triggering expensive write faults.
5512 		 */
5513 		while (force_write) {
5514 			if (spte == ARM_PTE_EMPTY) {
5515 				spte = os_atomic_load(pte_p, relaxed);
5516 			}
5517 			const pt_entry_t prev_pte = spte;
5518 
5519 			/* A concurrent disconnect may have cleared the PTE. */
5520 			if (__improbable(!pte_is_valid(spte))) {
5521 				break;
5522 			}
5523 
5524 			/* Inherit permissions and "was_writeable" from the template. */
5525 			spte = (spte & ~(ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE)) |
5526 			    (tmplate & (ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE));
5527 
5528 			/* Access flag should be set for any immediate change in protections */
5529 			spte |= ARM_PTE_AF;
5530 			const pmap_paddr_t pa = pte_to_pa(spte);
5531 			const unsigned int pai = pa_index(pa);
5532 			locked_pvh_t locked_pvh;
5533 			if (pa_valid(pa)) {
5534 				locked_pvh = pvh_lock(pai);
5535 
5536 				/**
5537 				 * The VM may concurrently call pmap_disconnect() on the compressor
5538 				 * page in question, e.g. if relocating the page to satisfy a precious
5539 				 * allocation.  Now that we hold the PVH lock, re-check the PTE and
5540 				 * restart the loop if it's different from the value we read before
5541 				 * we held the lock.
5542 				 */
5543 				if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) {
5544 					pvh_unlock(&locked_pvh);
5545 					spte = ARM_PTE_EMPTY;
5546 					continue;
5547 				}
5548 				ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT,
5549 				    PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5550 			}
5551 
5552 			__assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte);
5553 
5554 			/**
5555 			 * We don't expect the VM to be concurrently calling pmap_remove() against these
5556 			 * compressor mappings.  If it does for some reason, that could cause the above
5557 			 * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING.
5558 			 */
5559 			assert3u(sptm_status, ==, SPTM_MAP_VALID);
5560 
5561 			if (pa_valid(pa)) {
5562 				pvh_unlock(&locked_pvh);
5563 			}
5564 			break;
5565 		}
5566 
5567 #endif /* DEVELOPMENT || DEBUG */
5568 
5569 		va += pmap_page_size;
5570 		++pte_p;
5571 
5572 #if DEVELOPMENT || DEBUG
5573 		if (!force_write)
5574 #endif
5575 		{
5576 			sptm_pcpu->sptm_templates[num_mappings] = tmplate;
5577 			++num_mappings;
5578 			if (num_mappings == SPTM_MAPPING_LIMIT) {
5579 				/**
5580 				 * Enter the pmap epoch for the batched update operation.  This is necessary because we
5581 				 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
5582 				 * call, so a concurrent pmap_page_protect() operation against one of those pages may
5583 				 * race this call.  That should be perfectly fine as far as the PTE updates are concerned,
5584 				 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
5585 				 * if it does not first drain our epoch.
5586 				 */
5587 				pmap_epoch_enter();
5588 				sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5589 				    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5590 				pmap_epoch_exit();
5591 				need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5592 
5593 				/* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */
5594 				enable_preemption();
5595 				num_mappings = 0;
5596 				sptm_start_va = va;
5597 				disable_preemption();
5598 				sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5599 			}
5600 		}
5601 	}
5602 
5603 	/* This won't happen in the force_write case as we should never increment num_mappings. */
5604 	if (num_mappings != 0) {
5605 		pmap_epoch_enter();
5606 		sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5607 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5608 		pmap_epoch_exit();
5609 		need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5610 	}
5611 
5612 #if DEVELOPMENT || DEBUG
5613 	if (!force_write)
5614 #endif
5615 	{
5616 		enable_preemption();
5617 	}
5618 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
5619 	if (__improbable(need_strong_sync)) {
5620 		arm64_sync_tlb(true);
5621 	}
5622 	return va;
5623 }
5624 
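/**
 * Exported entry point: validate the request, handle the cases that reduce to
 * a removal or a no-op, then walk [b, e) in twig-sized chunks via
 * pmap_protect_options_internal().
 */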
5625 void
5626 pmap_protect_options(
5627 	pmap_t pmap,
5628 	vm_map_address_t b,
5629 	vm_map_address_t e,
5630 	vm_prot_t prot,
5631 	unsigned int options,
5632 	__unused void *args)
5633 {
5634 	vm_map_address_t l, beg;
5635 
5636 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5637 
5638 	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5639 		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5640 		    pmap, (uint64_t)b, (uint64_t)e);
5641 	}
5642 
5643 	/*
5644 	 * We allow single-page requests to execute non-preemptibly,
5645 	 * as it doesn't make sense to sample AST_URGENT for a single-page
5646 	 * operation, and there are a couple of special use cases that
5647 	 * require a non-preemptible single-page operation.
5648 	 */
5649 	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5650 		pmap_verify_preemptible();
5651 	}
5652 
5653 #if DEVELOPMENT || DEBUG
5654 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5655 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5656 			pmap_remove_options(pmap, b, e, options);
5657 			return;
5658 		}
5659 	} else
5660 #endif
5661 	{
5662 		/* Determine the new protection. */
5663 		switch (prot) {
5664 		case VM_PROT_READ:
5665 		case VM_PROT_READ | VM_PROT_EXECUTE:
5666 			break;
5667 		case VM_PROT_READ | VM_PROT_WRITE:
5668 		case VM_PROT_ALL:
5669 			return;         /* nothing to do */
5670 		case VM_PROT_EXECUTE:
5671 			if (pmap_allows_xo(pmap)) {
5672 				break;
5673 			}
5674 		/* Fall through and remove the mapping if XO is requested and [pmap] doesn't allow it. */
5675 		default:
5676 			pmap_remove_options(pmap, b, e, options);
5677 			return;
5678 		}
5679 	}
5680 
5681 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5682 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5683 	    VM_KERNEL_ADDRHIDE(e));
5684 
5685 	beg = b;
5686 
5687 	while (beg < e) {
5688 		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5689 
5690 		if (l > e) {
5691 			l = e;
5692 		}
5693 
5694 		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5695 	}
5696 
5697 
5698 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5699 }
5700 
5701 /**
5702  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5703  *
5704  * @param pmap pmap to insert the pages into.
5705  * @param va virtual address to map the pages into.
5706  * @param pa page number of the first physical page to map.
5707  * @param size block size, in number of pages.
5708  * @param prot mapping protection attributes.
5709  * @param attr flags to pass to pmap_enter().
5710  *
5711  * @return KERN_SUCCESS.
5712  */
5713 kern_return_t
5714 pmap_map_block(
5715 	pmap_t pmap,
5716 	addr64_t va,
5717 	ppnum_t pa,
5718 	uint32_t size,
5719 	vm_prot_t prot,
5720 	int attr,
5721 	unsigned int flags)
5722 {
5723 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5724 }
5725 
5726 /**
5727  * Inserts an arbitrary number of physical pages ("block") in a pmap.
5728  * As opposed to pmap_map_block(), this function takes
5729  * a physical address as an input and operates using the
5730  * page size associated with the input pmap.
5731  *
5732  * @param pmap pmap to insert the pages into.
5733  * @param va virtual address to map the pages into.
5734  * @param pa physical address of the first physical page to map.
5735  * @param size block size, in number of pages.
5736  * @param prot mapping protection attributes.
5737  * @param attr flags to pass to pmap_enter().
5738  *
5739  * @return KERN_SUCCESS.
5740  */
5741 kern_return_t
5742 pmap_map_block_addr(
5743 	pmap_t pmap,
5744 	addr64_t va,
5745 	pmap_paddr_t pa,
5746 	uint32_t size,
5747 	vm_prot_t prot,
5748 	int attr,
5749 	unsigned int flags)
5750 {
5751 #if __ARM_MIXED_PAGE_SIZE__
5752 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5753 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5754 #else
5755 	const uint64_t pmap_page_size = PAGE_SIZE;
5756 #endif
5757 
5758 	for (ppnum_t page = 0; page < size; page++) {
5759 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER) != KERN_SUCCESS) {
5760 			panic("%s: failed pmap_enter_addr, "
5761 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5762 			    __FUNCTION__,
5763 			    pmap, va, (uint64_t)pa, size, prot, flags);
5764 		}
5765 
5766 		va += pmap_page_size;
5767 		pa += pmap_page_size;
5768 	}
5769 
5770 
5771 	return KERN_SUCCESS;
5772 }
5773 
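/**
 * Convenience wrapper around pmap_enter_options_addr() with no extra options;
 * see pmap_enter() below for the general contract.
 */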
5774 kern_return_t
5775 pmap_enter_addr(
5776 	pmap_t pmap,
5777 	vm_map_address_t v,
5778 	pmap_paddr_t pa,
5779 	vm_prot_t prot,
5780 	vm_prot_t fault_type,
5781 	unsigned int flags,
5782 	boolean_t wired,
5783 	pmap_mapping_type_t mapping_type)
5784 {
5785 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, mapping_type);
5786 }
5787 
5788 /*
5789  *	Insert the given physical page (p) at
5790  *	the specified virtual address (v) in the
5791  *	target physical map with the protection requested.
5792  *
5793  *	If specified, the page will be wired down, meaning
5794  *	that the related pte can not be reclaimed.
5795  *
5796  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5797  *	or lose information.  That is, this routine must actually
5798  *	insert this page into the given map eventually (must make
5799  *	forward progress eventually).
5800  */
5801 kern_return_t
5802 pmap_enter(
5803 	pmap_t pmap,
5804 	vm_map_address_t v,
5805 	ppnum_t pn,
5806 	vm_prot_t prot,
5807 	vm_prot_t fault_type,
5808 	unsigned int flags,
5809 	boolean_t wired,
5810 	pmap_mapping_type_t mapping_type)
5811 {
5812 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type);
5813 }
5814 
5815 /**
5816  * Helper function for determining the frame type that will be required for a physical page given
5817  * a set of mapping constraints.
5818  *
5819  * @param pmap The address space in which the page will be mapped.
5820  * @param pte The fully-configured page table entry, including permissions and output address, that
5821  *            will be used for the mapping.
5822  * @param vaddr The virtual address that will be mapped using [pte]
5823  * @param options Extra mapping options that would be passed to pmap_enter() when performing the mapping
5824  * @param mapping_type The mapping type enum that would be passed to pmap_enter() when performing the mapping
5825  * @param prev_frame_type Output param that will store the existing frame type for the physical page
5826  *                        mapped by [pte].  As an optimization, this will only be queried if [*new_frame_type]
5827  *                        is determined to be something other than XNU_DEFAULT, otherwise it will be assumed
5828  *                        to be XNU_DEFAULT
5829  * @param new_frame_type Output param that will store the new frame type that will be required for the
5830  *                       physical page mapped by [pte]
5831  */
5832 static inline void
5833 pmap_frame_type_for_pte(
5834 	pmap_t pmap __assert_only,
5835 	pt_entry_t pte,
5836 	vm_map_address_t vaddr __assert_only,
5837 	unsigned int options,
5838 	pmap_mapping_type_t mapping_type,
5839 	sptm_frame_type_t *prev_frame_type,
5840 	sptm_frame_type_t *new_frame_type)
5841 {
5842 	const pmap_paddr_t paddr = pte_to_pa(pte) & ~PAGE_MASK;
5843 	assert(prev_frame_type != NULL);
5844 	assert(new_frame_type != NULL);
5845 	*prev_frame_type = *new_frame_type = XNU_DEFAULT;
5846 
5847 	const uint64_t pte_perms = pte_to_xprr_perm(pte);
5848 	/*
5849 	 * If the caller specified a mapping type of PMAP_MAPPING_TYPE_INFER, then we
5850 	 * keep the existing logic of deriving the SPTM frame type from the XPRR permissions.
5851 	 *
5852 	 * If the caller specified another mapping type, we simply follow that. This refactor was
5853 	 * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise about
5854 	 * what we want. It's better to let the caller specify the mapping type rather than use the
5855 	 * permissions for that.
5856 	 *
5857 	 * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323.
5858 	 */
5859 	if (__improbable(mapping_type != PMAP_MAPPING_TYPE_INFER)) {
5860 		switch (mapping_type) {
5861 		case PMAP_MAPPING_TYPE_DEFAULT:
5862 			*new_frame_type = (sptm_frame_type_t)mapping_type;
5863 			break;
5864 		case PMAP_MAPPING_TYPE_ROZONE:
5865 			assert(((pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + pt_attr_page_size(pmap_get_pt_attr(pmap)))));
5866 			*new_frame_type = (sptm_frame_type_t)mapping_type;
5867 			break;
5868 		case PMAP_MAPPING_TYPE_RESTRICTED:
5869 			if (use_xnu_restricted) {
5870 				*new_frame_type = (sptm_frame_type_t)mapping_type;
5871 			} else {
5872 				*new_frame_type = XNU_DEFAULT;
5873 			}
5874 			break;
5875 		default:
5876 			panic("invalid mapping type: %d", mapping_type);
5877 		}
5878 	} else if (__improbable(pte_perms == XPRR_USER_JIT_PERM)) {
5879 		/*
5880 		 * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using
5881 		 * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other
5882 		 * flags which the VM may have provided.
5883 		 *
5884 		 * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering
5885 		 * this case. We can't do this for now because this might trigger on some macOS
5886 		 * systems where applications use MAP_JIT with RW/RX permissions, and then later
5887 		 * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG
5888 		 * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can
5889 		 * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application
5890 		 * switches to RWX, then we can start asserting this requirement.
5891 		 */
5892 		*new_frame_type = XNU_USER_JIT;
5893 	} else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) {
5894 		/*
5895 		 * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must
5896 		 * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the
5897 		 * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type.
5898 		 */
5899 		*new_frame_type = XNU_USER_DEBUG;
5900 	} else if (pte_perms == XPRR_USER_RX_PERM) {
5901 		*new_frame_type = XNU_USER_EXEC;
5902 	} else if ((pte_perms == XPRR_USER_RW_PERM) ||
5903 	    (pte_was_writeable(pte) && (pte_perms == XPRR_USER_RO_PERM))) {
5904 		/**
5905 		 * Allow retyping from user executable types (except XNU_USER_DEBUG, which already
5906 		 * allows user RW mappings) back to XNU_DEFAULT if a writable mapping is requested.
5907 		 * Our retype logic will disconnect all existing mappings, so future attempts to
5908 		 * execute these pages will fault, retype back to exec, and go back through any
5909 		 * needed CS validation.  For all other current frame types, just leave the previous
5910 		 * and new frame types unchanged; for most other types attempting to add a user RW
5911 		 * mapping is a bug and we should just let the SPTM throw a violation.
5912 		 */
5913 		const sptm_frame_type_t cur_frame_type = sptm_get_frame_type(paddr);
5914 		if (__improbable(sptm_type_is_user_executable(cur_frame_type) &&
5915 		    (cur_frame_type != XNU_USER_DEBUG))) {
5916 			*prev_frame_type = cur_frame_type;
5917 		}
5918 	}
5919 
5920 	if (__improbable(*new_frame_type != XNU_DEFAULT)) {
5921 		*prev_frame_type = sptm_get_frame_type(paddr);
5922 	}
5923 }
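
/*
 * Illustrative sketch (not compiled into this file): how the inference rules
 * above pick an SPTM frame type when the caller passes PMAP_MAPPING_TYPE_INFER.
 * [user_pmap], [rx_pte] and [va] are hypothetical; [rx_pte] is assumed to carry
 * user read/execute (XPRR_USER_RX_PERM) permissions.
 *
 *	sptm_frame_type_t prev, new;
 *
 *	// User RX with no extra options infers XNU_USER_EXEC.
 *	pmap_frame_type_for_pte(user_pmap, rx_pte, va, 0,
 *	    PMAP_MAPPING_TYPE_INFER, &prev, &new);
 *	assert(new == XNU_USER_EXEC);
 *
 *	// The same PTE with PMAP_OPTIONS_XNU_USER_DEBUG infers XNU_USER_DEBUG,
 *	// since the debug type overlays the exec type.
 *	pmap_frame_type_for_pte(user_pmap, rx_pte, va, PMAP_OPTIONS_XNU_USER_DEBUG,
 *	    PMAP_MAPPING_TYPE_INFER, &prev, &new);
 *	assert(new == XNU_USER_DEBUG);
 */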
5924 
5925 /*
5926  * Construct a PTE (and the physical page attributes) for the given virtual to
5927  * physical mapping.
5928  *
5929  * @param pmap The pmap representing the address space for which to construct
5930  *             the mapping.
5931  * @param pa The physical address to be mapped by the new PTE.
5932  * @param prot Access permissions to apply to the new PTE.
5933  * @param fault_type The type of access fault that is triggering the request
5934  *                   to construct the new PTE.
5935  * @param wired Whether the new PTE should have the wired bit set.
5936  * @param options The extra mapping options passed to pmap_enter().
5937  * @param pp_attr_bits Output parameter that will return the physical page attributes
5938  *                     to apply to pp_attr_table for the new mapping.
5939  *
5940  * This function has no side effects and is safe to call while attempting a
5941  * pmap_enter transaction.
5942  */
5943 MARK_AS_PMAP_TEXT static pt_entry_t
5944 pmap_construct_pte(
5945 	const pmap_t pmap,
5946 	pmap_paddr_t pa,
5947 	vm_prot_t prot,
5948 	vm_prot_t fault_type,
5949 	boolean_t wired,
5950 	unsigned int options __unused,
5951 	uint16_t *pp_attr_bits /* OUTPUT */
5952 	)
5953 {
5954 	const pt_attr_t* const pt_attr = pmap_get_pt_attr(pmap);
5955 	bool set_NX = false, set_XO = false, set_TPRO = false;
5956 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
5957 	assert(pp_attr_bits != NULL);
5958 	*pp_attr_bits = 0;
5959 
5960 	if (wired) {
5961 		pte |= ARM_PTE_WIRED;
5962 	}
5963 
5964 #if DEVELOPMENT || DEBUG
5965 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5966 #else
5967 	if ((prot & VM_PROT_EXECUTE))
5968 #endif
5969 	{
5970 		set_NX = false;
5971 	} else {
5972 		set_NX = true;
5973 	}
5974 
5975 	if (__improbable(prot == VM_PROT_EXECUTE)) {
5976 		set_XO = true;
5977 		if (!pmap_allows_xo(pmap)) {
5978 			panic("%s: attempted execute-only mapping", __func__);
5979 		}
5980 	}
5981 
5982 	if (set_NX) {
5983 		pte |= pt_attr_leaf_xn(pt_attr);
5984 	} else {
5985 		if (pmap == kernel_pmap) {
5986 			pte |= ARM_PTE_NX;
5987 		} else {
5988 			pte |= pt_attr_leaf_x(pt_attr);
5989 		}
5990 	}
5991 
5992 	if (pmap == kernel_pmap) {
5993 #if __ARM_KERNEL_PROTECT__
5994 		pte |= ARM_PTE_NG;
5995 #endif /* __ARM_KERNEL_PROTECT__ */
5996 		if (prot & VM_PROT_WRITE) {
5997 			pte |= ARM_PTE_AP(AP_RWNA);
5998 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5999 		} else {
6000 			pte |= ARM_PTE_AP(AP_RONA);
6001 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6002 		}
6003 	} else {
6004 		if (pmap->type != PMAP_TYPE_NESTED) {
6005 			pte |= ARM_PTE_NG;
6006 		}
6007 		if (set_TPRO) {
6008 			pte |= pt_attr_leaf_rona(pt_attr);
6009 			*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6010 		} else if (prot & VM_PROT_WRITE) {
6011 			assert(pmap->type != PMAP_TYPE_NESTED);
6012 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
6013 				if (fault_type & VM_PROT_WRITE) {
6014 					pte |= pt_attr_leaf_rw(pt_attr);
6015 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
6016 				} else {
6017 					pte |= pt_attr_leaf_ro(pt_attr);
6018 					/*
6019 					 * Mark the page as MODFAULT so that a subsequent write
6020 					 * may be handled through arm_fast_fault().
6021 					 */
6022 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
6023 					pte_set_was_writeable(pte, true);
6024 				}
6025 			} else {
6026 				pte |= pt_attr_leaf_rw(pt_attr);
6027 				*pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
6028 			}
6029 		} else {
6030 			if (__improbable(set_XO)) {
6031 				pte |= pt_attr_leaf_rona(pt_attr);
6032 			} else {
6033 				pte |= pt_attr_leaf_ro(pt_attr);
6034 			}
6035 			*pp_attr_bits |= PP_ATTR_REFERENCED;
6036 		}
6037 	}
6038 
6039 	pte |= ARM_PTE_AF;
6040 	return pte;
6041 }
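
/*
 * Illustrative sketch (not compiled into this file): a hypothetical caller
 * constructing a wired, writable kernel PTE.  Per the logic above, a kernel
 * RW request yields AP_RWNA permissions and marks the page referenced and
 * modified; [pa] is assumed to be a page-aligned, managed physical address.
 *
 *	uint16_t attr_bits;
 *	pt_entry_t pte = pmap_construct_pte(kernel_pmap, pa,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE,
 *	    TRUE, 0, &attr_bits);
 *	// pte now has ARM_PTE_WIRED, ARM_PTE_AF and AP_RWNA set, and
 *	// attr_bits == (PP_ATTR_MODIFIED | PP_ATTR_REFERENCED).
 */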
6042 
6043 /**
6044  * This function allows the VM to query whether a mapping operation will result in a page being
6045  * retyped, without actually performing the mapping operation.  It's useful for the VM to know
6046  * this when performing up-front page validation under the VM object lock.
6047  *
6048  * @param pmap The address space in which the mapping will occur
6049  * @param vaddr The virtual address that will be mapped
6050  * @param pn The physical page number to be mapped by [vaddr]
6051  * @param prot The permissions to be used for the mapping
6052  * @param options The extra mapping options that would be passed to pmap_enter() if the
6053  *                mapping operation were performed
6054  * @param mapping_type The mapping type enum that would be passed to pmap_enter() if the
6055  *                     mapping operation were performed
6056  *
6057  * @return True if the mapping operation would produce a retype of the page at [pn],
6058  *         False otherwise
6059  */
6060 bool
6061 pmap_will_retype(
6062 	pmap_t pmap,
6063 	vm_map_address_t vaddr,
6064 	ppnum_t pn,
6065 	vm_prot_t prot,
6066 	unsigned int options,
6067 	pmap_mapping_type_t mapping_type)
6068 {
6069 	const pmap_paddr_t paddr = ptoa(pn);
6070 	uint16_t pp_attr_bits;
6071 	pt_entry_t pte = pmap_construct_pte(pmap, paddr, prot, prot, false, options, &pp_attr_bits);
6072 	sptm_frame_type_t prev_frame_type, new_frame_type;
6073 	pmap_frame_type_for_pte(pmap, pte, vaddr, options, mapping_type, &prev_frame_type, &new_frame_type);
6074 
6075 	return new_frame_type != prev_frame_type;
6076 }
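
/*
 * Illustrative sketch (not compiled into this file): a hypothetical VM-side
 * caller using pmap_will_retype() to decide whether a page needs up-front
 * validation before the mapping is committed.  The object/offset names and
 * the validation helper are assumptions, not part of this interface.
 *
 *	if (pmap_will_retype(map->pmap, va, pn, VM_PROT_READ | VM_PROT_EXECUTE,
 *	    0, PMAP_MAPPING_TYPE_INFER)) {
 *		// The page would transition to an executable frame type, so
 *		// validate it while the VM object lock is still held.
 *		vm_object_validate_page(object, offset);
 *	}
 */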
6077 
6078 /*
6079  * Attempt to update a PTE constructed by pmap_enter_options().
6080  *
6081  * @note performs no page table or accounting modifications, nor any lasting SPTM page type modification, on failure.
6082  * @note expects to be called with preemption disabled to guarantee safe access to SPTM per-CPU data.
6083  *
6084  * @param pmap The pmap representing the address space in which to store the new PTE
6085  * @param pte_p The physical aperture KVA of the PTE to store
6086  * @param new_pte The new value to store in *pte_p
6087  * @param locked_pvh Input/Output parameter pointing to a wrapped pv_head_table entry returned by
6088  *        a previous call to pvh_lock().  *locked_pvh will be updated if existing mappings
6089  *        need to be disconnected prior to retyping.
6090  * @param old_pte Returns the prior PTE contents, iff the PTE is successfully updated
6091  * @param v The virtual address mapped by pte_p
6092  * @param options bitmask of PMAP_OPTIONS_* flags passed to pmap_enter_options().
6093  * @param mapping_type The type of the new mapping, this defines which SPTM frame type to use.
6094  *
6095  * @return SPTM_SUCCESS iff able to successfully update *pte_p to new_pte via sptm_map_page(),
6096  *         SPTM_MAP_VALID if an existing mapping was successfully upgraded via sptm_map_page(),
6097  *         SPTM_MAP_FLUSH_PENDING if the TLB flush of a previous mapping is still in-flight and
6098  *             the mapping operation should be retried, or if the mapping operation should be retried
6099  *             because we had to temporarily re-enable preemption which would invalidate caller-held
6100  *             per-CPU data.
6101  *         Otherwise an appropriate SPTM or TXM error code; in these cases the mapping should not be
6102  *             retried and the caller should return an error.
6103  */
6104 static inline sptm_return_t
6105 pmap_enter_pte(
6106 	pmap_t pmap,
6107 	pt_entry_t *pte_p,
6108 	pt_entry_t new_pte,
6109 	locked_pvh_t *locked_pvh,
6110 	pt_entry_t *old_pte,
6111 	vm_map_address_t v,
6112 	unsigned int options,
6113 	pmap_mapping_type_t mapping_type)
6114 {
6115 	sptm_pte_t prev_pte;
6116 	bool changed_wiring = false;
6117 
6118 	assert(pte_p != NULL);
6119 	assert(old_pte != NULL);
6120 
6121 	/* SPTM TODO: handle PAGE_RATIO_4 configurations if those devices remain supported. */
6122 
6123 	assert(get_preemption_level() > 0);
6124 	const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK;
6125 	sptm_frame_type_t prev_frame_type;
6126 	sptm_frame_type_t new_frame_type;
6127 
6128 	pmap_frame_type_for_pte(pmap, new_pte, v, options, mapping_type, &prev_frame_type, &new_frame_type);
6129 
6130 	if (__improbable(new_frame_type != prev_frame_type)) {
6131 		/**
6132 		 * Remove all existing mappings prior to retyping, so that we can safely retype without having to worry
6133 		 * about a concurrent operation on one of those mappings triggering an SPTM violation.  In particular,
6134 		 * pmap_remove() may clear a mapping to this page without holding its PVH lock.  This approach works
6135 		 * because we hold the PVH lock during this call, and any attempt to enter a new mapping for the page
6136 		 * will also need to grab the PVH lock and call this function.
6137 		 */
6138 		pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
6139 		    PMAP_OPTIONS_PPO_PENDING_RETYPE, locked_pvh, NULL);
6140 		/**
6141 		 * In the unlikely event that pmap_page_protect_options_with_flush_range() had to process
6142 		 * an excessively long PV list, it will have enabled preemption by placing the PVH lock
6143 		 * in sleep mode.  In this case, we may have been migrated to a different CPU, and caller
6144 		 * assumptions about the state of per-CPU data (such as per-CPU PVE availability) will no
6145 		 * longer hold true.  Ask the caller to retry by pretending we encountered a pending flush.
6146 		 */
6147 		if (__improbable(preemption_enabled())) {
6148 			return SPTM_MAP_FLUSH_PENDING;
6149 		}
6150 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
6151 		/* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */
6152 		prev_frame_type = sptm_get_frame_type(pa);
6153 		if (new_frame_type != prev_frame_type) {
6154 			sptm_retype(pa, prev_frame_type, new_frame_type, retype_params);
6155 		}
6156 	}
6157 
6158 	if (pmap->type == PMAP_TYPE_NESTED) {
6159 		/**
6160 		 * Enter the epoch before we check the unnesting state of the leaf page table, so that a
6161 		 * concurrent pmap_unnest() operation can guarantee that we either observe the unnested
6162 		 * table state and install a non-global mapping, or have finished installing a global mapping
6163 		 * before it marks all existing mappings as non-global.
6164 		 */
6165 		pmap_epoch_enter();
6166 		vm_map_offset_t nested_region_size = os_atomic_load(&pmap->nested_region_size, acquire);
6167 		if (nested_region_size && (v >= pmap->nested_region_addr) && (v < (pmap->nested_region_addr + nested_region_size))) {
6168 			assert(pmap->nested_region_addr != 0);
6169 			assert(pmap->nested_region_unnested_table_bitmap != NULL);
6170 			unsigned int index = (unsigned int)((v - pmap->nested_region_addr) >>
6171 			    pt_attr_twig_shift(pmap_get_pt_attr(pmap)));
6172 
6173 			if ((bitmap_test(pmap->nested_region_unnested_table_bitmap, UNNEST_IN_PROGRESS_BIT(index)))) {
6174 				new_pte |= ARM_PTE_NG;
6175 			}
6176 		}
6177 	}
6178 	const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte);
6179 	if (pmap->type == PMAP_TYPE_NESTED) {
6180 		pmap_epoch_exit();
6181 	}
6182 	if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) {
6183 		/*
6184 		 * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as
6185 		 * opposed to a TXM error.  In the case of SPTM_MAP_FLUSH_PENDING, pmap_enter() will drop the PVH
6186 		 * lock before turning around to retry the mapping operation.  It may then be possible for the
6187 		 * mapping state of the page to change such that our next attempt to map it will fail with a TXM
6188 		 * error, so if we were to leave the new type in place here we would then have lost our record
6189 		 * of the previous type and would effectively leave the page in an inconsistent state.
6190 		 */
6191 		if (__improbable(new_frame_type != prev_frame_type)) {
6192 			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
6193 			sptm_retype(pa, new_frame_type, prev_frame_type, retype_params);
6194 		}
6195 		return sptm_status;
6196 	}
6197 
6198 	*old_pte = prev_pte = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes[0];
6199 
6200 	if (prev_pte != new_pte) {
6201 		changed_wiring = pte_is_compressed(prev_pte, pte_p) ?
6202 		    (new_pte & ARM_PTE_WIRED) != 0 :
6203 		    (new_pte & ARM_PTE_WIRED) != (prev_pte & ARM_PTE_WIRED);
6204 
6205 		if ((pmap != kernel_pmap) && changed_wiring) {
6206 			pte_update_wiredcnt(pmap, pte_p, (new_pte & ARM_PTE_WIRED) != 0);
6207 		}
6208 
6209 		PMAP_TRACE(4 + pt_attr_leaf_level(pmap_get_pt_attr(pmap)), PMAP_CODE(PMAP__TTE),
6210 		    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v),
6211 		    VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)), new_pte);
6212 	}
6213 
6214 	return sptm_status;
6215 }
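
/*
 * Illustrative sketch (not compiled into this file) of the retry contract
 * described above: SPTM_MAP_FLUSH_PENDING is transient and the transaction
 * should be retried, while other non-success codes are terminal.  The local
 * names and error_for() helper are hypothetical; pmap_enter_options_internal()
 * below is the real caller of this function.
 *
 *	disable_preemption();
 *	const sptm_return_t ret = pmap_enter_pte(pmap, pte_p, new_pte,
 *	    &locked_pvh, &old_pte, va, options, mapping_type);
 *	enable_preemption();
 *	if (ret == SPTM_MAP_FLUSH_PENDING) {
 *		goto retry;              // transient: re-run the transaction
 *	} else if ((ret != SPTM_SUCCESS) && (ret != SPTM_MAP_VALID)) {
 *		return error_for(ret);   // terminal: report the failure
 *	}
 */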
6216 
6217 MARK_AS_PMAP_TEXT static pt_entry_t
6218 wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
6219 {
6220 	pt_entry_t pte;
6221 
6222 	switch (wimg & (VM_WIMG_MASK)) {
6223 	case VM_WIMG_IO:
6224 		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
6225 		// Device-nGnRnE. On H14+, accesses to them can be reordered by
6226 		// AP, while preserving the security benefits of using device
6227 		// mapping against side-channel attacks. On pre-H14 platforms,
6228 		// the accesses will still be strongly ordered.
6229 		if (is_dram_addr(pa)) {
6230 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6231 		} else {
6232 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
6233 #if HAS_FEAT_XS
6234 			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
6235 			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
6236 				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
6237 			}
6238 #endif /* HAS_FEAT_XS */
6239 		}
6240 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6241 		break;
6242 	case VM_WIMG_RT:
6243 		if (is_dram_addr(pa)) {
6244 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
6245 		} else {
6246 #if HAS_FEAT_XS
6247 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6248 #else /* HAS_FEAT_XS */
6249 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6250 #endif /* HAS_FEAT_XS */
6251 #if DEBUG || DEVELOPMENT
6252 			pmap_wcrt_on_non_dram_count_increment_atomic();
6253 #endif /* DEBUG || DEVELOPMENT */
6254 		}
6255 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6256 		break;
6257 	case VM_WIMG_POSTED:
6258 		if (is_dram_addr(pa)) {
6259 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6260 		} else {
6261 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
6262 		}
6263 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6264 		break;
6265 	case VM_WIMG_POSTED_REORDERED:
6266 		if (is_dram_addr(pa)) {
6267 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6268 		} else {
6269 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
6270 		}
6271 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6272 		break;
6273 	case VM_WIMG_POSTED_COMBINED_REORDERED:
6274 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6275 #if HAS_FEAT_XS
6276 		if (!is_dram_addr(pa)) {
6277 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6278 		}
6279 #endif /* HAS_FEAT_XS */
6280 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6281 		break;
6282 	case VM_WIMG_WCOMB:
6283 		if (is_dram_addr(pa)) {
6284 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
6285 		} else {
6286 #if HAS_FEAT_XS
6287 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
6288 #else /* HAS_FEAT_XS */
6289 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
6290 #endif /* HAS_FEAT_XS */
6291 #if DEBUG || DEVELOPMENT
6292 			pmap_wcrt_on_non_dram_count_increment_atomic();
6293 #endif /* DEBUG || DEVELOPMENT */
6294 		}
6295 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
6296 		break;
6297 	case VM_WIMG_WTHRU:
6298 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
6299 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6300 		break;
6301 	case VM_WIMG_COPYBACK:
6302 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
6303 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6304 		break;
6305 #if HAS_MTE
6306 	case VM_WIMG_MTE:
6307 		assert(is_mte_enabled);
6308 
6309 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_MTE);
6310 		pte |= ARM_PTE_SH(SH_MTE);
6311 		break;
6312 #else /* HAS_MTE */
6313 	case VM_WIMG_INNERWBACK:
6314 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
6315 		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
6316 		break;
6317 #endif /* HAS_MTE */
6318 	default:
6319 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
6320 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
6321 	}
6322 
6323 	return pte;
6324 }
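
/*
 * Illustrative sketch (not compiled into this file): per the cases above, the
 * same WIMG value can yield different memory attributes depending on whether
 * the target physical address is DRAM.  [mmio_pa] and [dram_pa] are
 * hypothetical physical addresses.
 *
 *	// Device register: strongly-ordered (or XS-variant) device memory.
 *	pt_entry_t io_pte  = wimg_to_pte(VM_WIMG_IO, mmio_pa);
 *
 *	// DRAM mapped with VM_WIMG_IO: relaxed to Device-GRE so accesses may be
 *	// reordered while retaining the side-channel benefits of a device mapping.
 *	pt_entry_t ram_pte = wimg_to_pte(VM_WIMG_IO, dram_pa);
 */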
6325 
6326 MARK_AS_PMAP_TEXT kern_return_t
6327 pmap_enter_options_internal(
6328 	pmap_t pmap,
6329 	vm_map_address_t v,
6330 	pmap_paddr_t pa,
6331 	vm_prot_t prot,
6332 	vm_prot_t fault_type,
6333 	unsigned int flags,
6334 	boolean_t wired,
6335 	unsigned int options,
6336 	pmap_mapping_type_t mapping_type)
6337 {
6338 	ppnum_t         pn = (ppnum_t)atop(pa);
6339 	pt_entry_t      *pte_p;
6340 	unsigned int    wimg_bits;
6341 	bool            committed = false;
6342 	kern_return_t   kr = KERN_SUCCESS;
6343 	uint16_t pp_attr_bits;
6344 	pv_free_list_t *local_pv_free;
6345 
6346 	validate_pmap_mutable(pmap);
6347 
6348 	/**
6349 	 * Prepare for the SPTM call early by prefetching the relevant FTEs. Cache misses
6350 	 * taken by the SPTM when accessing these contribute a large portion of the delay on
6351 	 * the critical path. Technically, sptm_prefetch_fte may not find an FTE associated
6352 	 * with pa and return LIBSPTM_FAILURE. However, we are okay with that as it's only
6353 	 * a best-effort performance optimization.
6354 	 */
6355 	sptm_prefetch_fte(pmap->ttep);
6356 	sptm_prefetch_fte(pa);
6357 
6358 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6359 
6360 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
6361 		panic("pmap_enter_options() pmap %p v 0x%llx",
6362 		    pmap, (uint64_t)v);
6363 	}
6364 
6365 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
6366 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
6367 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
6368 	}
6369 
6370 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
6371 		panic("pmap_enter_options() pmap %p pa 0x%llx",
6372 		    pmap, (uint64_t)pa);
6373 	}
6374 
6375 	/* The PA should not extend beyond the architected physical address space */
6376 	pa &= ARM_PTE_PAGE_MASK;
6377 
6378 	if (__improbable((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap))) {
6379 #if (defined(KERNEL_INTEGRITY_CTRR) || defined(KERNEL_INTEGRITY_PV_CTRR)) && defined(CONFIG_XNUPOST)
6380 		extern vm_offset_t ctrr_test_page;
6381 		if (__probable(v != ctrr_test_page))
6382 #endif
6383 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
6384 	}
6385 	if (__improbable((prot == VM_PROT_EXECUTE) && !pmap_allows_xo(pmap))) {
6386 		return KERN_PROTECTION_FAILURE;
6387 	}
6388 
6389 	assert(pn != vm_page_fictitious_addr);
6390 
6391 	pmap_lock(pmap, PMAP_LOCK_SHARED);
6392 
6393 	/*
6394 	 *	Expand pmap to include this pte.  Assume that
6395 	 *	pmap is always expanded to include enough hardware
6396 	 *	pages to map one VM page.
6397 	 */
6398 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
6399 		/* Must unlock to expand the pmap. */
6400 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6401 
6402 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
6403 
6404 		if (kr != KERN_SUCCESS) {
6405 			return kr;
6406 		}
6407 
6408 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6409 	}
6410 
6411 	if (options & PMAP_OPTIONS_NOENTER) {
6412 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6413 		return KERN_SUCCESS;
6414 	}
6415 
6416 	/*
6417 	 * Since we may not hold the pmap lock exclusive, updating the pte is
6418 	 * done via a transactional retry loop.
6419 	 * We need to be careful about modifying non-local data structures before committing
6420 	 * the new pte since we may need to re-do the transaction.
6421 	 */
6422 	const pt_entry_t prev_pte = os_atomic_load(pte_p, relaxed);
6423 
6424 	if (pte_is_valid(prev_pte) && (pte_to_pa(prev_pte) != pa)) {
6425 		/*
6426 		 * There is already a mapping here & it's for a different physical page.
6427 		 * First remove that mapping.
6428 		 * We assume it is safe to leave the pmap lock held for shared access rather
6429 		 * than exclusive access here, because the VM should never try to
6430 		 * simultaneously map the same VA to multiple different physical pages.
6431 		 * If that assumption is violated, sptm_map_page() will panic as the architecture
6432 		 * does not allow the output address of a mapping to be changed without a break-
6433 		 * before-make sequence.
6434 		 */
6435 		pmap_remove_range(pmap, v, v + PAGE_SIZE);
6436 	}
6437 
6438 	const pt_entry_t pte = pmap_construct_pte(pmap, pa, prot, fault_type, wired, options, &pp_attr_bits);
6439 
6440 	while (!committed) {
6441 		pt_entry_t spte = ARM_PTE_EMPTY;
6442 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
6443 		bool skip_footprint_debit = false;
6444 
6445 		if (pa_valid(pa)) {
6446 			unsigned int pai;
6447 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
6448 
6449 			is_internal = FALSE;
6450 			is_altacct = FALSE;
6451 
6452 			pai = pa_index(pa);
6453 			locked_pvh_t locked_pvh;
6454 
6455 			if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
6456 				locked_pvh = pvh_lock_nopreempt(pai);
6457 			} else {
6458 				locked_pvh = pvh_lock(pai);
6459 			}
6460 
6461 			/*
6462 			 * Make sure that the current per-cpu PV free list has
6463 			 * enough entries (2 in the worst-case scenario) to handle the pmap_enter_pv() call
6464 			 * if the transaction succeeds. At this point, preemption has either
6465 			 * been disabled by the caller or by pvh_lock() above.
6466 			 * Note that we can still be interrupted, but a primary
6467 			 * interrupt handler can never enter the pmap.
6468 			 */
6469 			assert(get_preemption_level() > 0);
6470 			local_pv_free = &pmap_get_cpu_data()->pv_free;
6471 			const bool allocation_required = !pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL) &&
6472 			    !(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP) && pvh_ptep(locked_pvh.pvh) == pte_p);
6473 
6474 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
6475 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
6476 				int new_allocated_pves = 0;
6477 				volatile uint16_t *wiredcnt = NULL;
6478 				if (pmap != kernel_pmap) {
6479 					ptd_info_t *ptd_info = ptep_get_info(pte_p);
6480 					wiredcnt = &ptd_info->wiredcnt;
6481 				}
6482 
6483 				while (new_allocated_pves < 2) {
6484 					local_pv_free = &pmap_get_cpu_data()->pv_free;
6485 					pv_status = pv_alloc(pmap, PMAP_LOCK_SHARED, options, &new_pve_p[new_allocated_pves], &locked_pvh, wiredcnt);
6486 					if (pv_status == PV_ALLOC_FAIL) {
6487 						break;
6488 					} else if (pv_status == PV_ALLOC_RETRY) {
6489 						/*
6490 						 * In the case that pv_alloc() had to grab a new page of PVEs,
6491 						 * it will have dropped the pmap lock while doing so.
6492 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
6493 						 * be on a different CPU now.
6494 						 */
6495 						local_pv_free = &pmap_get_cpu_data()->pv_free;
6496 					} else {
6497 						/* If we've gotten this far then a node should've been allocated. */
6498 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
6499 
6500 						new_allocated_pves++;
6501 					}
6502 				}
6503 
6504 				for (int i = 0; i < new_allocated_pves; i++) {
6505 					pv_free(new_pve_p[i]);
6506 				}
6507 			}
6508 
6509 			if (pv_status == PV_ALLOC_FAIL) {
6510 				pvh_unlock(&locked_pvh);
6511 				kr = KERN_RESOURCE_SHORTAGE;
6512 				break;
6513 			} else if (pv_status == PV_ALLOC_RETRY) {
6514 				pvh_unlock(&locked_pvh);
6515 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
6516 				continue;
6517 			}
6518 
6519 #if HAS_MTE
6520 			if (flags & VM_MEM_MAP_MTE) {
6521 				wimg_bits = VM_WIMG_MTE;
6522 			} else
6523 #endif /* HAS_MTE */
6524 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6525 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6526 			} else {
6527 				wimg_bits = pmap_cache_attributes(pn);
6528 			}
6529 
6530 			/**
6531 			 * We may be retrying this operation after dropping the PVH lock.
6532 			 * Cache attributes for the physical page may have changed while the lock
6533 			 * was dropped, so update PTE cache attributes on each loop iteration.
6534 			 */
6535 			const pt_entry_t new_pte = pte | pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6536 
6537 
6538 			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, new_pte, &locked_pvh, &spte, v, options, mapping_type);
6539 			assert(committed == false);
6540 			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
6541 				committed = true;
6542 			} else if (sptm_status == SPTM_MAP_FLUSH_PENDING) {
6543 				pvh_unlock(&locked_pvh);
6544 				continue;
6545 			} else if (sptm_status == SPTM_MAP_CODESIGN_ERROR) {
6546 				pvh_unlock(&locked_pvh);
6547 				kr = KERN_CODESIGN_ERROR;
6548 				break;
6549 			} else {
6550 				pvh_unlock(&locked_pvh);
6551 				kr = KERN_FAILURE;
6552 				break;
6553 			}
6554 			const bool had_valid_mapping = (sptm_status == SPTM_MAP_VALID);
6555 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
6556 			if (!had_valid_mapping) {
6557 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
6558 				int pve_ptep_idx = 0;
6559 				pv_status = pmap_enter_pv(pmap, pte_p, options, PMAP_LOCK_SHARED, &locked_pvh, &new_pve_p, &pve_ptep_idx);
6560 				/* We did all the allocations up top. So this shouldn't be able to fail. */
6561 				if (pv_status != PV_ALLOC_SUCCESS) {
6562 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
6563 					    __func__, pv_status, new_pve_p, pmap);
6564 				}
6565 
6566 				if (pmap != kernel_pmap) {
6567 					if (options & PMAP_OPTIONS_INTERNAL) {
6568 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
6569 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
6570 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
6571 							/*
6572 							 * Make a note to ourselves that this
6573 							 * mapping is using alternative
6574 							 * accounting. We'll need this in order
6575 							 * to know which ledger to debit when
6576 							 * the mapping is removed.
6577 							 *
6578 							 * The altacct bit must be set while
6579 							 * the pv head is locked. Defer the
6580 							 * ledger accounting until after we've
6581 							 * dropped the lock.
6582 							 */
6583 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
6584 							is_altacct = TRUE;
6585 						}
6586 					}
6587 					if (ppattr_test_reusable(pai) &&
6588 					    !is_altacct) {
6589 						is_reusable = TRUE;
6590 					} else if (options & PMAP_OPTIONS_INTERNAL) {
6591 						is_internal = TRUE;
6592 					} else {
6593 						is_external = TRUE;
6594 					}
6595 				}
6596 			}
6597 
6598 			pvh_unlock(&locked_pvh);
6599 
6600 			if (pp_attr_bits != 0) {
6601 				ppattr_pa_set_bits(pa, pp_attr_bits);
6602 			}
6603 
6604 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
6605 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6606 
6607 				if (is_internal) {
6608 					/*
6609 					 * Make corresponding adjustments to
6610 					 * phys_footprint statistics.
6611 					 */
6612 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6613 					if (is_altacct) {
6614 						/*
6615 						 * If this page is internal and
6616 						 * in an IOKit region, credit
6617 						 * the task's total count of
6618 						 * dirty, internal IOKit pages.
6619 						 * It should *not* count towards
6620 						 * the task's total physical
6621 						 * memory footprint, because
6622 						 * this entire region was
6623 						 * already billed to the task
6624 						 * at the time the mapping was
6625 						 * created.
6626 						 *
6627 						 * Put another way, this is
6628 						 * internal++ and
6629 						 * alternate_accounting++, so
6630 						 * net effect on phys_footprint
6631 						 * is 0. That means: don't
6632 						 * touch phys_footprint here.
6633 						 */
6634 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6635 					} else {
6636 						if (pte_is_compressed(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
6637 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
6638 							skip_footprint_debit = true;
6639 						} else {
6640 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6641 						}
6642 					}
6643 				}
6644 				if (is_reusable) {
6645 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6646 				} else if (is_external) {
6647 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6648 				}
6649 			}
6650 		} else {
6651 			if (prot & VM_PROT_EXECUTE) {
6652 				kr = KERN_FAILURE;
6653 				break;
6654 			}
6655 
6656 			wimg_bits = pmap_cache_attributes(pn);
6657 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
6658 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
6659 			}
6660 
6661 			pt_entry_t new_pte = pte | pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
6662 
6663 
6664 			/**
6665 			 * pmap_enter_pte() expects to be called with preemption disabled so it can access
6666 			 * the per-CPU prev_ptes array.
6667 			 */
6668 			disable_preemption();
6669 			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, new_pte, NULL, &spte, v, options, mapping_type);
6670 			enable_preemption();
6671 			assert(committed == false);
6672 			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
6673 				committed = true;
6674 
6675 				/**
6676 				 * If there was already a valid pte here then we reuse its
6677 				 * reference on the ptd and drop the one that we took above.
6678 				 */
6679 			} else if (__improbable(sptm_status != SPTM_MAP_FLUSH_PENDING)) {
6680 				panic("%s: Unexpected SPTM return code %u for non-managed PA 0x%llx", __func__, (unsigned int)sptm_status, (unsigned long long)pa);
6681 			}
6682 		}
6683 		if (committed) {
6684 			if (pte_is_compressed(spte, pte_p)) {
6685 				assert(pmap != kernel_pmap);
6686 
6687 				/* One less "compressed" */
6688 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
6689 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
6690 
6691 				if (spte & ARM_PTE_COMPRESSED_ALT) {
6692 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6693 				} else if (!skip_footprint_debit) {
6694 					/* Was part of the footprint */
6695 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6696 				}
6697 			}
6698 		}
6699 	}
6700 
6701 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
6702 
6703 	if (kr == KERN_CODESIGN_ERROR) {
6704 		/* Print any logs from TXM */
6705 		txm_print_logs();
6706 	}
6707 	return kr;
6708 }
6709 
6710 kern_return_t
6711 pmap_enter_options_addr(
6712 	pmap_t pmap,
6713 	vm_map_address_t v,
6714 	pmap_paddr_t pa,
6715 	vm_prot_t prot,
6716 	vm_prot_t fault_type,
6717 	unsigned int flags,
6718 	boolean_t wired,
6719 	unsigned int options,
6720 	__unused void   *arg,
6721 	pmap_mapping_type_t mapping_type)
6722 {
6723 	kern_return_t kr = KERN_FAILURE;
6724 
6725 
6726 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6727 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6728 
6729 	kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options, mapping_type);
6730 
6731 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6732 
6733 	return kr;
6734 }
6735 
6736 kern_return_t
6737 pmap_enter_options(
6738 	pmap_t pmap,
6739 	vm_map_address_t v,
6740 	ppnum_t pn,
6741 	vm_prot_t prot,
6742 	vm_prot_t fault_type,
6743 	unsigned int flags,
6744 	boolean_t wired,
6745 	unsigned int options,
6746 	__unused void   *arg,
6747 	pmap_mapping_type_t mapping_type)
6748 {
6749 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot,
6750 	           fault_type, flags, wired, options, arg, mapping_type);
6751 }
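
/*
 * Illustrative sketch (not compiled into this file): a hypothetical wired,
 * writable kernel mapping entered through the ppnum_t-based wrapper above.
 * [kva] and [pn] are assumptions for the example only.
 *
 *	kern_return_t kr = pmap_enter_options(kernel_pmap, kva, pn,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, TRUE,
 *	    0, NULL, PMAP_MAPPING_TYPE_DEFAULT);
 *	if (kr != KERN_SUCCESS) {
 *		// e.g. KERN_RESOURCE_SHORTAGE if PV entries could not be allocated.
 *	}
 */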
6752 
6753 /*
6754  *	Routine:	pmap_change_wiring
6755  *	Function:	Change the wiring attribute for a map/virtual-address
6756  *			pair.
6757  *	In/out conditions:
6758  *			The mapping must already exist in the pmap.
6759  */
6760 MARK_AS_PMAP_TEXT void
6761 pmap_change_wiring_internal(
6762 	pmap_t pmap,
6763 	vm_map_address_t v,
6764 	boolean_t wired)
6765 {
6766 	pt_entry_t     *pte_p, prev_pte;
6767 
6768 	validate_pmap_mutable(pmap);
6769 
6770 	pmap_lock(pmap, PMAP_LOCK_SHARED);
6771 
6772 	const pt_entry_t new_wiring = (wired ? ARM_PTE_WIRED : 0);
6773 
6774 	pte_p = pmap_pte(pmap, v);
6775 	if (pte_p == PT_ENTRY_NULL) {
6776 		if (!wired) {
6777 			/*
6778 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6779 			 * may have been freed by a remove operation.
6780 			 */
6781 			goto pmap_change_wiring_return;
6782 		} else {
6783 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6784 		}
6785 	}
6786 
6787 	disable_preemption();
6788 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
6789 	sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring;
6790 
6791 	pmap_epoch_enter();
6792 	sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED);
6793 	pmap_epoch_exit();
6794 
6795 	prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed);
6796 	enable_preemption();
6797 
6798 	if (!pte_is_valid(prev_pte)) {
6799 		goto pmap_change_wiring_return;
6800 	}
6801 
6802 	if ((pmap != kernel_pmap) && (wired != pte_is_wired(prev_pte))) {
6803 		pte_update_wiredcnt(pmap, pte_p, wired);
6804 	}
6805 
6806 pmap_change_wiring_return:
6807 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
6808 }
6809 
6810 void
6811 pmap_change_wiring(
6812 	pmap_t pmap,
6813 	vm_map_address_t v,
6814 	boolean_t wired)
6815 {
6816 	pmap_change_wiring_internal(pmap, v, wired);
6817 }
6818 
6819 MARK_AS_PMAP_TEXT pmap_paddr_t
6820 pmap_find_pa_internal(
6821 	pmap_t pmap,
6822 	addr64_t va)
6823 {
6824 	pmap_paddr_t    pa = 0;
6825 
6826 	validate_pmap(pmap);
6827 
6828 	if (pmap != kernel_pmap) {
6829 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6830 	}
6831 
6832 	pa = pmap_vtophys(pmap, va);
6833 
6834 	if (pmap != kernel_pmap) {
6835 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6836 	}
6837 
6838 	return pa;
6839 }
6840 
6841 pmap_paddr_t
6842 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6843 {
6844 	pmap_paddr_t pa = 0;
6845 
6846 	if (pmap == kernel_pmap) {
6847 		pa = mmu_kvtop(va);
6848 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6849 		/*
6850 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6851 		 * translation even if PAN would prevent kernel access through the translation.
6852 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6853 		 */
6854 		pa = mmu_uvtop(va);
6855 	}
6856 	return pa;
6857 }
6858 
6859 pmap_paddr_t
6860 pmap_find_pa(
6861 	pmap_t pmap,
6862 	addr64_t va)
6863 {
6864 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6865 
6866 	if (pa != 0) {
6867 		return pa;
6868 	}
6869 
6870 	if (not_in_kdp) {
6871 		return pmap_find_pa_internal(pmap, va);
6872 	} else {
6873 		return pmap_vtophys(pmap, va);
6874 	}
6875 }
6876 
6877 ppnum_t
6878 pmap_find_phys_nofault(
6879 	pmap_t pmap,
6880 	addr64_t va)
6881 {
6882 	ppnum_t ppn;
6883 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6884 	return ppn;
6885 }
6886 
6887 ppnum_t
6888 pmap_find_phys(
6889 	pmap_t pmap,
6890 	addr64_t va)
6891 {
6892 	ppnum_t ppn;
6893 	ppn = atop(pmap_find_pa(pmap, va));
6894 	return ppn;
6895 }
6896 
6897 /**
6898  * Translate a kernel virtual address into a physical address.
6899  *
6900  * @param va The kernel virtual address to translate. Does not work on user
6901  *           virtual addresses.
6902  *
6903  * @return The physical address if the translation was successful, or zero if
6904  *         no valid mappings were found for the given virtual address.
6905  */
6906 pmap_paddr_t
6907 kvtophys(vm_offset_t va)
6908 {
6909 	sptm_paddr_t pa;
6910 
6911 	if (sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS) {
6912 		return 0;
6913 	}
6914 
6915 	return pa;
6916 }
6917 
6918 /**
6919  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6920  * points to a non-kernel-managed physical page, then this call will panic().
6921  *
6922  * @note The output of this function is guaranteed to be a kernel-managed
6923  *       physical page, which means it's safe to pass the output directly to
6924  *       pa_index() to create a physical address index for various pmap data
6925  *       structures.
6926  *
6927  * @param va The kernel virtual address to translate. Does not work on user
6928  *           virtual addresses.
6929  *
6930  * @return The translated physical address for the given virtual address.
6931  */
6932 pmap_paddr_t
6933 kvtophys_nofail(vm_offset_t va)
6934 {
6935 	pmap_paddr_t pa;
6936 
6937 	if (__improbable(sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS)) {
6938 		panic("%s: VA->PA translation failed for va %p", __func__, (void *)va);
6939 	}
6940 
6941 	return pa;
6942 }
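
/*
 * Illustrative sketch (not compiled into this file): choosing between the two
 * translation helpers above.  [some_kva] is a hypothetical kernel virtual
 * address.
 *
 *	pmap_paddr_t pa = kvtophys(some_kva);
 *	if (pa == 0) {
 *		// No valid mapping; the caller must handle the failure.
 *	}
 *
 *	// When the mapping is known to exist and to target a kernel-managed
 *	// page, the nofail variant can feed pa_index() directly:
 *	unsigned int pai = pa_index(kvtophys_nofail(some_kva));
 */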
6943 
6944 pmap_paddr_t
6945 pmap_vtophys(
6946 	pmap_t pmap,
6947 	addr64_t va)
6948 {
6949 	if ((va < pmap->min) || (va >= pmap->max)) {
6950 		return 0;
6951 	}
6952 
6953 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6954 
6955 	tt_entry_t * ttp = NULL;
6956 	tt_entry_t * ttep = NULL;
6957 	tt_entry_t   tte = ARM_TTE_EMPTY;
6958 	pmap_paddr_t pa = 0;
6959 	unsigned int cur_level;
6960 
6961 	ttp = pmap->tte;
6962 
6963 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6964 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6965 
6966 		tte = *ttep;
6967 
6968 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6969 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6970 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6971 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6972 
6973 		if ((tte & valid_mask) != valid_mask) {
6974 			return (pmap_paddr_t) 0;
6975 		}
6976 
6977 		/* This detects both leaf entries and intermediate block mappings. */
6978 		if ((tte & type_mask) == type_block) {
6979 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6980 			break;
6981 		}
6982 
6983 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6984 	}
6985 
6986 	return pa;
6987 }
6988 
6989 /*
6990  *	pmap_init_pte_page - Initialize a page table page.
6991  */
6992 MARK_AS_PMAP_TEXT void
6993 pmap_init_pte_page(
6994 	pmap_t pmap,
6995 	pt_entry_t *pte_p,
6996 	vm_offset_t va,
6997 	unsigned int ttlevel,
6998 	boolean_t alloc_ptd)
6999 {
7000 	pt_desc_t   *ptdp = NULL;
7001 	unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
7002 	const uintptr_t pvh = pai_to_pvh(pai);
7003 
7004 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
7005 		if (alloc_ptd) {
7006 			/*
7007 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
7008 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
7009 			 * bootstrap request, so we check for an existing PTD here.
7010 			 */
7011 			ptdp = ptd_alloc(pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
7012 			if (ptdp == NULL) {
7013 				panic("%s: unable to allocate PTD", __func__);
7014 			}
7015 			locked_pvh_t locked_pvh = pvh_lock(pai);
7016 			pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
7017 			pvh_unlock(&locked_pvh);
7018 		} else {
7019 			panic("pmap_init_pte_page(): no PTD for pte_p %p", pte_p);
7020 		}
7021 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
7022 		ptdp = pvh_ptd(pvh);
7023 	} else {
7024 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
7025 	}
7026 
7027 	// pagetable zero-fill and barrier should be guaranteed by the SPTM
7028 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
7029 }
7030 
7031 /*
7032  * This function guarantees that a pmap has the necessary page tables in place
7033  * to map the specified VA.  If necessary, it will allocate new tables at any
7034  * non-root level in the hierarchy (the root table is always already allocated
7035  * and stored in the pmap).
7036  *
7037  * @note This function is expected to be called without any pmap or PVH lock
7038  *       held.
7039  *
7040  * @note It is possible for an L3 table newly allocated by this function to be
7041  *       deleted by another thread before control returns to the caller, iff that
7042  *       table is an ordinary userspace table.  Callers that use this function
7043  *       to allocate new user L3 tables are therefore expected to keep calling
7044  *       this function until they observe a successful L3 PTE lookup with the pmap
7045  *       lock held.  As long as it does not drop the pmap lock, the caller may
7046  *       then safely use the looked-up L3 table.  See the use of this function in
7047  *       pmap_enter_options_internal() for an example.
7048  *
7049  * @param pmap The pmap for which to ensure mapping space is present.
7050  * @param vaddr The virtual address for which to ensure mapping space is present
7051  *              in [pmap].
7052  * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be
7053  *                allocated.  The only valid option is PMAP_OPTIONS_NOWAIT, which
7054  *                specifies that the allocation must not block.
7055  * @param level The maximum paging level for which to ensure a table is present.
7056  *
7057  * @return KERN_INVALID_ADDRESS if [vaddr] is outside the pmap's mappable range,
7058  *         KERN_RESOURCE_SHORTAGE if a new table can't be allocated,
7059  *         KERN_SUCCESS otherwise.
7060  */
7061 MARK_AS_PMAP_TEXT static kern_return_t
7062 pmap_expand(
7063 	pmap_t pmap,
7064 	vm_map_address_t vaddr,
7065 	unsigned int options,
7066 	unsigned int level)
7067 {
7068 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7069 
7070 	if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) {
7071 		return KERN_INVALID_ADDRESS;
7072 	}
7073 	pmap_paddr_t table_pa = pmap->ttep;
7074 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7075 	const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1;
7076 	unsigned int ttlevel = pt_attr_root_level(pt_attr);
7077 	tt_entry_t *table_ttep = pmap->tte;
7078 	tt_entry_t *ttep;
7079 	tt_entry_t old_tte = ARM_TTE_EMPTY;
7080 
7081 	for (; ttlevel < level; ttlevel++) {
7082 		/**
7083 		 * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE.
7084 		 * Doing this step at the beginning of the loop instead of the end (which would make it part of
7085 		 * the prior iteration) avoids the possibility of executing this step to extract an L3 table KVA
7086 		 * from an L2 TTE, which would be useless because there would be no next iteration to make use
7087 		 * of the table KVA.
7088 		 */
7089 		if (table_ttep == NULL) {
7090 			assert(tte_is_valid_table(old_tte));
7091 			table_pa = old_tte & ARM_TTE_TABLE_MASK;
7092 			table_ttep = (tt_entry_t*)phystokv(table_pa);
7093 		}
7094 
7095 		vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr);
7096 
7097 		/**
7098 		 * We don't need to hold the pmap lock while walking the paging hierarchy.  Only L3 tables are
7099 		 * allowed to be dynamically removed, and only for regular user pmaps at that.  We may allocate
7100 		 * a new L3 table below, but we will only access L0-L2 tables, so there's no risk of a table
7101 		 * being deleted while we are using it for the next level(s) of lookup.
7102 		 */
7103 		ttep = &table_ttep[ttn_index(pt_attr, vaddr, ttlevel)];
7104 		old_tte = os_atomic_load(ttep, relaxed);
7105 		table_ttep = NULL;
7106 		if (!tte_is_valid_table(old_tte)) {
7107 			tt_entry_t new_tte, *new_ttep;
7108 			pt_desc_t *new_ptdp;
7109 			while (pmap_tt_allocate(pmap, &new_ttep, &new_ptdp, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) {
7110 				if (options & PMAP_OPTIONS_NOWAIT) {
7111 					return KERN_RESOURCE_SHORTAGE;
7112 				}
7113 				VM_PAGE_WAIT();
7114 			}
7115 			assert(pa_valid(table_pa));
7116 			/**
7117 			 * Grab the lower-level table's PVH lock to ensure we don't try to concurrently map different
7118 			 * tables at the same TTE.
7119 			 */
7120 			locked_pvh_t locked_pvh = pvh_lock(pa_index(table_pa));
7121 			old_tte = os_atomic_load(ttep, relaxed);
7122 			if (!tte_is_valid_table(old_tte)) {
7123 				/**
7124 				 * This call must be issued prior to sptm_map_table() so that the page table's
7125 				 * PTD info is valid by the time the new table becomes visible in the paging
7126 				 * hierarchy. sptm_map_table() is expected to issue a barrier that effectively
7127 				 * guarantees the PTD update will be visible to concurrent observers as soon as
7128 				 * the new table becomes visible in the paging hierarchy.
7129 				 */
7130 				pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE);
7131 				pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)new_ttep);
7132 				/*
7133 				 * If the table is going to map a kernel RO zone VA region, then we must
7134 				 * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE.  The SPTM's type system
7135 				 * requires the table to be transitioned through XNU_DEFAULT for refcount
7136 				 * enforcement, which is fine since this path is expected to execute only
7137 				 * once during boot.
7138 				 */
7139 				if (__improbable(ttlevel == pt_attr_twig_level(pt_attr)) &&
7140 				    (pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE)) {
7141 					sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
7142 					sptm_retype(pa, XNU_PAGE_TABLE, XNU_DEFAULT, retype_params);
7143 					retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
7144 					sptm_retype(pa, XNU_DEFAULT, XNU_PAGE_TABLE_ROZONE, retype_params);
7145 				}
7146 				new_tte = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
7147 				sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte);
7148 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
7149 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte);
7150 
7151 				/**
7152 				 * Now that we've fully mapped the table, do final initialization of PTD
7153 				 * state, which includes dropping the wired count to allow future reclamation
7154 				 * of the page table page.
7155 				 */
7156 				ptd_info_finalize(new_ptdp);
7157 
7158 				table_pa = pa;
7159 				/**
7160 				 * If we need to set up multiple TTEs mapping different parts of the same page
7161 				 * (e.g. because we're carving multiple 4K page tables out of a 16K native page),
7162 				 * determine which of the grouped TTEs is the one that we need to follow for the
7163 				 * next level of the table walk.
7164 				 */
7165 				table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) *
7166 				    (pmap_page_size / sizeof(tt_entry_t)));
7167 				new_ttep = (tt_entry_t *)NULL;
7168 			}
7169 			pvh_unlock(&locked_pvh);
7170 
7171 			if (new_ttep != (tt_entry_t *)NULL) {
7172 				pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1);
7173 				new_ttep = (tt_entry_t *)NULL;
7174 			}
7175 		}
7176 	}
7177 
7178 	return KERN_SUCCESS;
7179 }
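
/*
 * Illustrative sketch (not compiled into this file) of the lookup/expand retry
 * idiom the comment above describes for user L3 tables; the local names are
 * hypothetical and pmap_enter_options_internal() is the real example.
 *
 *	pmap_lock(pmap, PMAP_LOCK_SHARED);
 *	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
 *		pmap_unlock(pmap, PMAP_LOCK_SHARED);
 *		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
 *		if (kr != KERN_SUCCESS) {
 *			return kr;
 *		}
 *		pmap_lock(pmap, PMAP_LOCK_SHARED);
 *	}
 *	// pte_p remains usable for as long as the pmap lock is held.
 */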
7180 
7181 /*
7182  *	Routine:	pmap_gc
7183  *	Function:
7184  *              Pmap garbage collection
7185  *		Called by the pageout daemon when pages are scarce.
7186  *
7187  */
7188 void
7189 pmap_gc(void)
7190 {
7191 	/*
7192 	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
7193 	 * We can't simply destroy an arbitrary pmap, since it may be active on a CPU
7194 	 * or may contain wired mappings.  However, it may make sense to scan the pmap VM
7195 	 * object here, and for each page consult the SPTM frame table and if necessary
7196 	 * the PTD in the PV head table.  If the frame table indicates the page is a leaf
7197 	 * page table page and the PTD indicates it has no wired mappings, we can call
7198 	 * pmap_remove() on the VA region mapped by the page and therein return the page
7199 	 * to the VM.
7200 	 */
7201 }
7202 
7203 /*
7204  *      By default, don't attempt pmap GC more frequently
7205  *      than once per minute.
7206  */
7207 
7208 void
7209 compute_pmap_gc_throttle(
7210 	void *arg __unused)
7211 {
7212 }
7213 
7214 /*
7215  * pmap_attribute_cache_sync(vm_offset_t pa)
7216  *
7217  * Invalidates all of the instruction cache on a physical page and
7218  * pushes any dirty data from the data cache for the same physical page
7219  */
7220 
7221 kern_return_t
7222 pmap_attribute_cache_sync(
7223 	ppnum_t pp,
7224 	vm_size_t size,
7225 	__unused vm_machine_attribute_t attribute,
7226 	__unused vm_machine_attribute_val_t * value)
7227 {
7228 	if (size > PAGE_SIZE) {
7229 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7230 	} else {
7231 		cache_sync_page(pp);
7232 	}
7233 
7234 	return KERN_SUCCESS;
7235 }
7236 
7237 /*
7238  * pmap_sync_page_data_phys(ppnum_t pp)
7239  *
7240  * Invalidates all of the instruction cache on a physical page and
7241  * pushes any dirty data from the data cache for the same physical page.
7242  * Not required on SPTM systems, because the SPTM automatically performs
7243  * the invalidate operation when retyping to one of the types that allow
7244  * for executable permissions.
7245  */
7246 void
7247 pmap_sync_page_data_phys(
7248 	__unused ppnum_t pp)
7249 {
7250 	return;
7251 }
7252 
7253 /*
7254  * pmap_sync_page_attributes_phys(ppnum_t pp)
7255  *
7256  * Write back and invalidate all cachelines on a physical page.
7257  */
7258 void
7259 pmap_sync_page_attributes_phys(
7260 	ppnum_t pp)
7261 {
7262 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7263 }
7264 
7265 #if CONFIG_COREDUMP
7266 /* temporary workaround */
7267 boolean_t
7268 coredumpok(
7269 	vm_map_t map,
7270 	mach_vm_offset_t va)
7271 {
7272 	pt_entry_t     *pte_p;
7273 	pt_entry_t      spte;
7274 
7275 	pte_p = pmap_pte(map->pmap, va);
7276 	if (0 == pte_p) {
7277 		return FALSE;
7278 	}
7279 	if (vm_map_entry_has_device_pager(map, va)) {
7280 		return FALSE;
7281 	}
7282 	spte = *pte_p;
7283 #if HAS_MTE
7284 	return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT || ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_MTE;
7285 #else /* !HAS_MTE */
7286 	return ARM_PTE_EXTRACT_ATTRINDX(spte) == CACHE_ATTRINDX_DEFAULT;
7287 #endif /* HAS_MTE */
7288 }
7289 #endif
7290 
7291 void
7292 fillPage(
7293 	ppnum_t pn,
7294 	unsigned int fill)
7295 {
7296 	unsigned int   *addr;
7297 	int             count;
7298 
7299 	addr = (unsigned int *) phystokv(ptoa(pn));
7300 	count = PAGE_SIZE / sizeof(unsigned int);
7301 	while (count--) {
7302 		*addr++ = fill;
7303 	}
7304 }
7305 
7306 extern void     mapping_set_mod(ppnum_t pn);
7307 
7308 void
7309 mapping_set_mod(
7310 	ppnum_t pn)
7311 {
7312 	pmap_set_modify(pn);
7313 }
7314 
7315 extern void     mapping_set_ref(ppnum_t pn);
7316 
7317 void
7318 mapping_set_ref(
7319 	ppnum_t pn)
7320 {
7321 	pmap_set_reference(pn);
7322 }
7323 
7324 /*
7325  * Clear specified attribute bits.
7326  *
7327  * Try to force an arm_fast_fault() for all mappings of
7328  * the page - to force attributes to be set again at fault time.
7329  * If the forcing succeeds, clear the cached bits at the head.
7330  * Otherwise, something must have been wired, so leave the cached
7331  * attributes alone.
7332  */
7333 MARK_AS_PMAP_TEXT static void
7334 phys_attribute_clear_with_flush_range(
7335 	ppnum_t         pn,
7336 	unsigned int    bits,
7337 	int             options,
7338 	void            *arg,
7339 	pmap_tlb_flush_range_t *flush_range)
7340 {
7341 	pmap_paddr_t    pa = ptoa(pn);
7342 	vm_prot_t       allow_mode = VM_PROT_ALL;
7343 
7344 	if ((arg != NULL) || (flush_range != NULL)) {
7345 		options = options & ~PMAP_OPTIONS_NOFLUSH;
7346 	}
7347 
7348 	if (__improbable((options & PMAP_OPTIONS_FF_WIRED) != 0)) {
7349 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7350 		    "invalid options",
7351 		    pn, bits, options, arg, flush_range);
7352 	}
7353 
7354 	if (__improbable((bits & PP_ATTR_MODIFIED) &&
7355 	    (options & PMAP_OPTIONS_NOFLUSH))) {
7356 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7357 		    "should not clear 'modified' without flushing TLBs",
7358 		    pn, bits, options, arg, flush_range);
7359 	}
7360 
7361 	assert(pn != vm_page_fictitious_addr);
7362 
7363 	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7364 		assert(bits == PP_ATTR_MODIFIED);
7365 
7366 		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, NULL, flush_range);
7367 		/*
7368 		 * We short circuit this case; it should not need to
7369 		 * invoke arm_force_fast_fault, so just clear the modified bit.
7370 		 * pmap_page_protect has taken care of resetting
7371 		 * the state so that we'll see the next write as a fault to
7372 		 * the VM (i.e. we don't want a fast fault).
7373 		 */
7374 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7375 		return;
7376 	}
7377 	if (bits & PP_ATTR_REFERENCED) {
7378 		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7379 	}
7380 	if (bits & PP_ATTR_MODIFIED) {
7381 		allow_mode &= ~VM_PROT_WRITE;
7382 	}
7383 
7384 	if (bits == PP_ATTR_NOENCRYPT) {
7385 		/*
7386 		 * We short circuit this case; it should not need to
7387 		 * invoke arm_force_fast_fault, so just clear and
7388 		 * return.  On ARM, this bit is just a debugging aid.
7389 		 */
7390 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7391 		return;
7392 	}
7393 
7394 	arm_force_fast_fault_with_flush_range(pn, allow_mode, options, NULL, (pp_attr_t)bits, flush_range);
7395 }
7396 
7397 MARK_AS_PMAP_TEXT void
7398 phys_attribute_clear_internal(
7399 	ppnum_t         pn,
7400 	unsigned int    bits,
7401 	int             options,
7402 	void            *arg)
7403 {
7404 	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7405 }
7406 
7407 #if __ARM_RANGE_TLBI__
7408 
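/*
 *	Clear the specified attribute bits for every managed page mapped by the
 *	single leaf table covering [start, end) in [pmap], accumulating SPTM
 *	region/disjoint operations in [flush_range] so the caller can coalesce
 *	TLB flushes.  Returns [end].
 */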
7409 MARK_AS_PMAP_TEXT static vm_map_address_t
7410 phys_attribute_clear_twig_internal(
7411 	pmap_t pmap,
7412 	vm_map_address_t start,
7413 	vm_map_address_t end,
7414 	unsigned int bits,
7415 	unsigned int options,
7416 	pmap_tlb_flush_range_t *flush_range)
7417 {
7418 	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7419 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7420 	assert(end >= start);
7421 	assert((end - start) <= pt_attr_twig_size(pt_attr));
7422 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7423 	vm_map_address_t va = start;
7424 	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7425 	tt_entry_t     *tte_p;
7426 	tte_p = pmap_tte(pmap, start);
7427 
7428 	/**
7429 	 * It's possible that this portion of our VA region has never been paged in, in which case
7430 	 * there may not be a valid twig or leaf table here.
7431 	 */
7432 	if ((tte_p == (tt_entry_t *) NULL) || !tte_is_valid_table(*tte_p)) {
7433 		assert(flush_range->pending_region_entries == 0);
7434 		return end;
7435 	}
7436 
7437 	pte_p = (pt_entry_t *) ttetokv(*tte_p);
7438 
7439 	start_pte_p = &pte_p[pte_index(pt_attr, start)];
7440 	end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7441 	assert(end_pte_p >= start_pte_p);
7442 	for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7443 		if (flush_range->pending_region_entries == 0) {
7444 			flush_range->pending_region_start = va;
7445 		} else {
7446 			assertf((flush_range->pending_region_start +
7447 			    (flush_range->pending_region_entries * pmap_page_size)) == va,
7448 			    "pending_region_start 0x%llx + 0x%lx pages != va 0x%llx",
7449 			    (unsigned long long)flush_range->pending_region_start,
7450 			    (unsigned long)flush_range->pending_region_entries,
7451 			    (unsigned long long)va);
7452 		}
7453 		flush_range->current_ptep = curr_pte_p;
7454 		const pt_entry_t spte = os_atomic_load(curr_pte_p, relaxed);
7455 		const pmap_paddr_t pa = pte_to_pa(spte);
7456 		if (pte_is_valid(spte) && pa_valid(pa)) {
7457 			/* The PTE maps a managed page, so do the appropriate PV list-based permission changes. */
7458 			const ppnum_t pn = (ppnum_t) atop(pa);
7459 			phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7460 			if (__probable(flush_range->region_entry_added)) {
7461 				flush_range->region_entry_added = false;
7462 			} else {
7463 				/**
7464 				 * It's possible that some other thread removed the mapping between our check
7465 				 * of the PTE above and taking the PVH lock in the
7466 				 * phys_attribute_clear_with_flush_range() path.  In that case we have a
7467 				 * discontinuity in the region to update, so just submit any pending region
7468 				 * templates and start a new region op on the next iteration.
7469 				 */
7470 				pmap_multipage_op_submit_region(flush_range);
7471 			}
7472 		} else if (__improbable(!pte_is_valid(spte))) {
7473 			/**
7474 			 * We've found an invalid mapping, so we have a discontinuity in the region to
7475 			 * update.  Handle this by submitting any pending region templates and starting a new
7476 			 * region on the next iteration.  In theory we could instead handle this by installing
7477 			 * a "safe" (AF bit cleared, minimal permissions) PTE template; the SPTM would just
7478 			 * ignore the update on finding an invalid mapping in the PTE.  But we don't know
7479 			 * what a "safe" template will be in all cases: for example, JIT regions require all
7480 			 * mappings to either be invalid or to have full RWX permissions.
7481 			 */
7482 			pmap_multipage_op_submit_region(flush_range);
7483 		} else if (pmap_insert_flush_range_template(spte, flush_range)) {
7484 			/**
7485 			 * We've found a mapping to a non-managed page, so just insert the existing
7486 			 * PTE into the pending region ops since we don't manage attributes for non-managed
7487 			 * pages.
7488 			 * If pmap_insert_flush_range_template() returns true, indicating that it reached
7489 			 * the mapping limit and submitted the SPTM call, then we also submit any pending
7490 			 * disjoint ops.  Having pending operations in either category will keep preemption
7491 			 * disabled, and we want to ensure that we can at least temporarily
7492 			 * re-enable preemption every SPTM_MAPPING_LIMIT mappings.
7493 			 */
7494 			pmap_multipage_op_submit_disjoint(0, flush_range);
7495 		}
7496 
7497 		/**
7498 		 * If the total number of pending + processed entries exceeds the mapping threshold,
7499 		 * we may need to submit all pending operations to avoid excessive preemption latency.
7500 		 * Otherwise, a small number of pending disjoint or region ops can hold preemption
7501 		 * disabled across an arbitrary number of total processed entries.
7502 		 * As an optimization, we may be able to avoid submitting if no urgent AST is
7503 		 * pending on the local CPU, but only if we aren't currently in an epoch.  If we are
7504 		 * in an epoch, failure to submit in a timely manner can cause another CPU to wait
7505 		 * too long for our epoch to drain.
7506 		 */
7507 		if (((flush_range->processed_entries + flush_range->pending_disjoint_entries +
7508 		    flush_range->pending_region_entries) >= SPTM_MAPPING_LIMIT) &&
7509 		    (pmap_in_epoch() || pmap_pending_preemption())) {
7510 			pmap_multipage_op_submit(flush_range);
7511 			assert(preemption_enabled());
7512 		}
7513 	}
7514 
7515 	/* SPTM region ops can't span L3 table boundaries, so submit any pending region templates now. */
7516 	pmap_multipage_op_submit_region(flush_range);
7517 	return end;
7518 }
7519 
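/*
 *	Clear the specified attribute bits for all managed pages mapped in
 *	[start, end) of [pmap], processing one twig-sized chunk at a time and
 *	issuing a single ranged TLB flush at the end if any mapping required it.
 */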
7520 MARK_AS_PMAP_TEXT vm_map_address_t
7521 phys_attribute_clear_range_internal(
7522 	pmap_t pmap,
7523 	vm_map_address_t start,
7524 	vm_map_address_t end,
7525 	unsigned int bits,
7526 	unsigned int options)
7527 {
7528 	if (__improbable(end < start)) {
7529 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7530 	}
7531 	validate_pmap_mutable(pmap);
7532 
7533 	vm_map_address_t va = start;
7534 	pmap_tlb_flush_range_t flush_range = {
7535 		.ptfr_pmap = pmap,
7536 		.ptfr_start = start,
7537 		.ptfr_end = end,
7538 		.current_ptep = NULL,
7539 		.pending_region_start = 0,
7540 		.pending_region_entries = 0,
7541 		.region_entry_added = false,
7542 		.current_header = NULL,
7543 		.current_header_first_mapping_index = 0,
7544 		.processed_entries = 0,
7545 		.pending_disjoint_entries = 0,
7546 		.ptfr_flush_needed = false
7547 	};
7548 
7549 	pmap_lock(pmap, PMAP_LOCK_SHARED);
7550 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7551 
7552 	while (va < end) {
7553 		vm_map_address_t curr_end;
7554 
7555 		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7556 		if (curr_end > end) {
7557 			curr_end = end;
7558 		}
7559 
7560 		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7561 	}
7562 	pmap_multipage_op_submit(&flush_range);
7563 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
7564 	assert((flush_range.pending_disjoint_entries == 0) && (flush_range.pending_region_entries == 0));
7565 	if (flush_range.ptfr_flush_needed) {
7566 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7567 			flush_range.ptfr_start,
7568 			flush_range.ptfr_end - flush_range.ptfr_start,
7569 			flush_range.ptfr_pmap,
7570 			true);
7571 		sync_tlb_flush();
7572 	}
7573 	return va;
7574 }
7575 
7576 static void
7577 phys_attribute_clear_range(
7578 	pmap_t pmap,
7579 	vm_map_address_t start,
7580 	vm_map_address_t end,
7581 	unsigned int bits,
7582 	unsigned int options)
7583 {
7584 	/*
7585 	 * We allow single-page requests to execute non-preemptibly,
7586 	 * as it doesn't make sense to sample AST_URGENT for a single-page
7587 	 * operation, and there are a couple of special use cases that
7588 	 * require a non-preemptible single-page operation.
7589 	 */
7590 	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7591 		pmap_verify_preemptible();
7592 	}
7593 	__assert_only const int preemption_level = get_preemption_level();
7594 
7595 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7596 
7597 	phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7598 
7599 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7600 
7601 	assert(preemption_level == get_preemption_level());
7602 }
7603 #endif /* __ARM_RANGE_TLBI__ */
7604 
7605 static void
7606 phys_attribute_clear(
7607 	ppnum_t         pn,
7608 	unsigned int    bits,
7609 	int             options,
7610 	void            *arg)
7611 {
7612 	/*
7613 	 * Do we really want this tracepoint?  It will be extremely chatty.
7614 	 * Also, should we have a corresponding trace point for the set path?
7615 	 */
7616 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7617 
7618 	phys_attribute_clear_internal(pn, bits, options, arg);
7619 
7620 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7621 }
7622 
7623 /*
7624  *	Set specified attribute bits.
7625  *
7626  *	Set cached value in the pv head because we have
7627  *	no per-mapping hardware support for referenced and
7628  *	modify bits.
7629  */
7630 MARK_AS_PMAP_TEXT void
7631 phys_attribute_set_internal(
7632 	ppnum_t pn,
7633 	unsigned int bits)
7634 {
7635 	pmap_paddr_t    pa = ptoa(pn);
7636 	assert(pn != vm_page_fictitious_addr);
7637 
7638 	ppattr_pa_set_bits(pa, (uint16_t)bits);
7639 
7640 	return;
7641 }
7642 
7643 static void
7644 phys_attribute_set(
7645 	ppnum_t pn,
7646 	unsigned int bits)
7647 {
7648 	phys_attribute_set_internal(pn, bits);
7649 }
7650 
7651 
7652 /*
7653  *	Check specified attribute bits.
7654  *
7655  *	use the software cached bits (since no hw support).
7656  */
7657 static boolean_t
7658 phys_attribute_test(
7659 	ppnum_t pn,
7660 	unsigned int bits)
7661 {
7662 	pmap_paddr_t    pa = ptoa(pn);
7663 	assert(pn != vm_page_fictitious_addr);
7664 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7665 }
7666 
7667 
7668 /*
7669  *	Set the modify/reference bits on the specified physical page.
7670  */
7671 void
7672 pmap_set_modify(ppnum_t pn)
7673 {
7674 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
7675 }
7676 
7677 
7678 /*
7679  *	Clear the modify bits on the specified physical page.
7680  */
7681 void
7682 pmap_clear_modify(
7683 	ppnum_t pn)
7684 {
7685 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7686 }
7687 
7688 
7689 /*
7690  *	pmap_is_modified:
7691  *
7692  *	Return whether or not the specified physical page is modified
7693  *	by any physical maps.
7694  */
7695 boolean_t
7696 pmap_is_modified(
7697 	ppnum_t pn)
7698 {
7699 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7700 }
7701 
7702 
7703 /*
7704  *	Set the reference bit on the specified physical page.
7705  */
7706 static void
7707 pmap_set_reference(
7708 	ppnum_t pn)
7709 {
7710 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
7711 }
7712 
7713 /*
7714  *	Clear the reference bits on the specified physical page.
7715  */
7716 void
7717 pmap_clear_reference(
7718 	ppnum_t pn)
7719 {
7720 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7721 }
7722 
7723 
7724 /*
7725  *	pmap_is_referenced:
7726  *
7727  *	Return whether or not the specified physical page is referenced
7728  *	by any physical maps.
7729  */
7730 boolean_t
7731 pmap_is_referenced(
7732 	ppnum_t pn)
7733 {
7734 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7735 }
7736 
7737 /*
7738  * pmap_get_refmod(phys)
7739  *  returns the referenced and modified bits of the specified
7740  *  physical page.
7741  */
7742 unsigned int
7743 pmap_get_refmod(
7744 	ppnum_t pn)
7745 {
7746 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7747 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7748 }
7749 
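/*
 *	Convert a VM_MEM_MODIFIED/VM_MEM_REFERENCED refmod mask into the
 *	corresponding PP_ATTR_* bits used by the pmap attribute table.
 */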
7750 static inline unsigned int
7751 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7752 {
7753 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7754 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7755 }
7756 
7757 /*
7758  * pmap_clear_refmod(phys, mask)
7759  *  clears the referenced and modified bits as specified by the mask
7760  *  of the specified physical page.
7761  */
7762 void
7763 pmap_clear_refmod_options(
7764 	ppnum_t         pn,
7765 	unsigned int    mask,
7766 	unsigned int    options,
7767 	void            *arg)
7768 {
7769 	unsigned int    bits;
7770 
7771 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7772 	phys_attribute_clear(pn, bits, options, arg);
7773 }
7774 
7775 /*
7776  * Perform pmap_clear_refmod_options on a virtual address range.
7777  * The operation will be performed in bulk, and TLB flushes will be coalesced
7778  * if possible.
7779  *
7780  * Returns true if the operation is supported on this platform.
7781  * If this function returns false, the operation is not supported and
7782  * nothing has been modified in the pmap.
7783  */
7784 bool
7785 pmap_clear_refmod_range_options(
7786 	pmap_t pmap __unused,
7787 	vm_map_address_t start __unused,
7788 	vm_map_address_t end __unused,
7789 	unsigned int mask __unused,
7790 	unsigned int options __unused)
7791 {
7792 #if __ARM_RANGE_TLBI__
7793 	unsigned int    bits;
7794 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7795 	phys_attribute_clear_range(pmap, start, end, bits, options);
7796 	return true;
7797 #else /* __ARM_RANGE_TLBI__ */
7798 #pragma unused(pmap, start, end, mask, options)
7799 	/*
7800 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7801 	 * contiguous range of addresses. This is a large performance improvement on
7802 	 * platforms that support ranged TLBI instructions. But on older platforms,
7803 	 * we can only flush per-page or the entire ASID, so we currently
7804 	 * only support this operation on platforms that support ranged TLBI
7805 	 * instructions. On other platforms, we require that
7806 	 * the VM modify the bits on a per-page basis.
7807 	 */
7808 	return false;
7809 #endif /* __ARM_RANGE_TLBI__ */
7810 }
7811 
7812 void
7813 pmap_clear_refmod(
7814 	ppnum_t pn,
7815 	unsigned int mask)
7816 {
7817 	pmap_clear_refmod_options(pn, mask, 0, NULL);
7818 }
7819 
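/*
 *	As pmap_disconnect(), but with pmap options.  When
 *	PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED is passed, the disconnect is
 *	upgraded to PMAP_OPTIONS_COMPRESSOR only if the page has been modified,
 *	since the software-managed "modified" bit is known up front.
 */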
7820 unsigned int
7821 pmap_disconnect_options(
7822 	ppnum_t pn,
7823 	unsigned int options,
7824 	void *arg)
7825 {
7826 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7827 		/*
7828 		 * On ARM, the "modified" bit is managed by software, so
7829 		 * we know up-front if the physical page is "modified",
7830 		 * without having to scan all the PTEs pointing to it.
7831 		 * The caller should have made the VM page "busy" so no one
7832 		 * should be able to establish any new mapping and "modify"
7833 		 * the page behind us.
7834 		 */
7835 		if (pmap_is_modified(pn)) {
7836 			/*
7837 			 * The page has been modified and will be sent to
7838 			 * the VM compressor.
7839 			 */
7840 			options |= PMAP_OPTIONS_COMPRESSOR;
7841 		} else {
7842 			/*
7843 			 * The page hasn't been modified and will be freed
7844 			 * instead of compressed.
7845 			 */
7846 		}
7847 	}
7848 
7849 	/* disconnect the page */
7850 	pmap_page_protect_options(pn, 0, options, arg);
7851 
7852 	/* return ref/chg status */
7853 	return pmap_get_refmod(pn);
7854 }
7855 
7856 /*
7857  *	Routine:
7858  *		pmap_disconnect
7859  *
7860  *	Function:
7861  *		Disconnect all mappings for this page and return reference and change status
7862  *		in generic format.
7863  *
7864  */
7865 unsigned int
7866 pmap_disconnect(
7867 	ppnum_t pn)
7868 {
7869 	pmap_page_protect(pn, 0);       /* disconnect the page */
7870 	return pmap_get_refmod(pn);   /* return ref/chg status */
7871 }
7872 
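/*
 *	Return TRUE if the physical page range [first, last] overlaps the
 *	pmap-managed region bounded by vm_first_phys and vm_last_phys.
 */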
7873 boolean_t
7874 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7875 {
7876 	if (ptoa(first) >= vm_last_phys) {
7877 		return FALSE;
7878 	}
7879 	if (ptoa(last) < vm_first_phys) {
7880 		return FALSE;
7881 	}
7882 
7883 	return TRUE;
7884 }
7885 
7886 /*
7887  * The state maintained by the noencrypt functions is used as a
7888  * debugging aid on ARM.  This incurs some overhead on the part
7889  * of the caller.  A special case check in phys_attribute_clear
7890  * (the most expensive path) currently minimizes this overhead,
7891  * but stubbing these functions out on RELEASE kernels yields
7892  * further wins.
7893  */
7894 boolean_t
7895 pmap_is_noencrypt(
7896 	ppnum_t pn)
7897 {
7898 #if DEVELOPMENT || DEBUG
7899 	boolean_t result = FALSE;
7900 
7901 	if (!pa_valid(ptoa(pn))) {
7902 		return FALSE;
7903 	}
7904 
7905 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7906 
7907 	return result;
7908 #else
7909 #pragma unused(pn)
7910 	return FALSE;
7911 #endif
7912 }
7913 
7914 void
7915 pmap_set_noencrypt(
7916 	ppnum_t pn)
7917 {
7918 #if DEVELOPMENT || DEBUG
7919 	if (!pa_valid(ptoa(pn))) {
7920 		return;
7921 	}
7922 
7923 	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7924 #else
7925 #pragma unused(pn)
7926 #endif
7927 }
7928 
7929 void
7930 pmap_clear_noencrypt(
7931 	ppnum_t pn)
7932 {
7933 #if DEVELOPMENT || DEBUG
7934 	if (!pa_valid(ptoa(pn))) {
7935 		return;
7936 	}
7937 
7938 	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7939 #else
7940 #pragma unused(pn)
7941 #endif
7942 }
7943 
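/*
 *	Lock the PV head entry for the given page if it is managed; otherwise
 *	fall back to the global phys_backup_lock.
 */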
7944 void
7945 pmap_lock_phys_page(ppnum_t pn)
7946 {
7947 	unsigned int    pai;
7948 	pmap_paddr_t    phys = ptoa(pn);
7949 
7950 	if (pa_valid(phys)) {
7951 		pai = pa_index(phys);
7952 		__unused const locked_pvh_t locked_pvh = pvh_lock(pai);
7953 	} else {
7954 		simple_lock(&phys_backup_lock, LCK_GRP_NULL);
7955 	}
7956 }
7957 
7958 
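/*
 *	Release the lock taken by pmap_lock_phys_page().
 */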
7959 void
7960 pmap_unlock_phys_page(ppnum_t pn)
7961 {
7962 	unsigned int    pai;
7963 	pmap_paddr_t    phys = ptoa(pn);
7964 
7965 	if (pa_valid(phys)) {
7966 		pai = pa_index(phys);
7967 		locked_pvh_t locked_pvh = {.pvh = pai_to_pvh(pai), .pai = pai};
7968 		pvh_unlock(&locked_pvh);
7969 	} else {
7970 		simple_unlock(&phys_backup_lock);
7971 	}
7972 }
7973 
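/*
 *	Point the user TTB at the invalid (empty) translation table so that no
 *	user-space mappings are reachable from this CPU.
 */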
7974 MARK_AS_PMAP_TEXT void
7975 pmap_clear_user_ttb_internal(void)
7976 {
7977 	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7978 }
7979 
7980 void
7981 pmap_clear_user_ttb(void)
7982 {
7983 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7984 	pmap_clear_user_ttb_internal();
7985 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7986 }
7987 
7988 /**
7989  * Set up a "fast fault", or a page fault that won't go through the VM layer on
7990  * a page. This is primarily used to manage ref/mod bits in software. Depending
7991  * on the value of allow_mode, the next read and/or write of the page will fault
7992  * and the ref/mod bits will be updated.
7993  *
7994  * @param ppnum Page number to set up a fast fault on.
7995  * @param allow_mode VM_PROT_NONE will cause the next read and write access to
7996  *                   fault.
7997  *                   VM_PROT_READ will only cause the next write access to fault.
7998  *                   Other values are undefined.
7999  * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
8000  *                PMAP_OPTIONS_FF_WIRED forces a fast fault even on wired pages.
8001  *                PMAP_OPTIONS_SET_REUSABLE/PMAP_OPTIONS_CLEAR_REUSABLE updates
8002  *                the global reusable bit of the page.
8003  * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
8004  *                   by the caller.  This is an input/output parameter which may be updated
8005  *                   to reflect a new PV head value to be passed to a later call to pvh_unlock().
8006  * @param bits_to_clear Mask of additional pp_attr_t bits to clear for the physical
8007  *                      page, iff this function completes successfully and returns
8008  *                      TRUE.  This is typically some combination of
8009  *                      the referenced, modified, and noencrypt bits.
8010  * @param flush_range When present, this function will skip the TLB flush for the
8011  *                    mappings that are covered by the range, leaving that to be
8012  *                    done later by the caller.  It may also avoid submitting mapping
8013  *                    updates directly to the SPTM, instead accumulating them in a
8014  *                    per-CPU array to be submitted later by the caller.
8015  *
8016  * @return TRUE if the fast fault was successfully configured for all mappings
8017  *         of the page, FALSE otherwise (e.g. if wired mappings are present and
8018  *         PMAP_OPTIONS_FF_WIRED was not passed).
8019  *
8020  * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
8021  *
8022  * @warning PMAP_OPTIONS_FF_WIRED should only be used with pages accessible from
8023  *          EL0.  The kernel may assume that accesses to wired, kernel-owned pages
8024  *          won't fault.
8025  */
8026 MARK_AS_PMAP_TEXT static boolean_t
8027 arm_force_fast_fault_with_flush_range(
8028 	ppnum_t         ppnum,
8029 	vm_prot_t       allow_mode,
8030 	int             options,
8031 	locked_pvh_t   *locked_pvh,
8032 	pp_attr_t       bits_to_clear,
8033 	pmap_tlb_flush_range_t *flush_range)
8034 {
8035 	pmap_paddr_t     phys = ptoa(ppnum);
8036 	pv_entry_t      *pve_p;
8037 	pt_entry_t      *pte_p;
8038 	unsigned int     pai;
8039 	boolean_t        result;
8040 	unsigned int     num_mappings = 0, num_skipped_mappings = 0;
8041 	bool             ref_fault;
8042 	bool             mod_fault;
8043 	bool             clear_write_fault = false;
8044 	bool             ref_aliases_mod = false;
8045 
8046 	assert(ppnum != vm_page_fictitious_addr);
8047 
8048 	/**
8049 	 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
8050 	 *
8051 	 * PMAP_OPTIONS_NOFLUSH indicates there is no need to flush the TLB at all for this operation, and
8052 	 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
8053 	 * semantics conflict with each other, so assert they are not both true.
8054 	 */
8055 	assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
8056 
8057 	if (!pa_valid(phys)) {
8058 		return FALSE;   /* Not a managed page. */
8059 	}
8060 
8061 	result = TRUE;
8062 	ref_fault = false;
8063 	mod_fault = false;
8064 	pai = pa_index(phys);
8065 	locked_pvh_t local_locked_pvh = {.pvh = 0};
8066 	if (__probable(locked_pvh == NULL)) {
8067 		if (flush_range != NULL) {
8068 			/**
8069 			 * If we're partway through processing a multi-page batched call,
8070 			 * preemption will already be disabled so we can't simply call
8071 			 * pvh_lock() which may block.  Instead, we first try to acquire
8072 			 * the lock without waiting, which in most cases should succeed.
8073 			 * If it fails, we submit the pending batched operations to re-
8074 			 * enable preemption and then acquire the lock normally.
8075 			 */
8076 			local_locked_pvh = pvh_try_lock(pai);
8077 			if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
8078 				pmap_multipage_op_submit(flush_range);
8079 				local_locked_pvh = pvh_lock(pai);
8080 			}
8081 		} else {
8082 			local_locked_pvh = pvh_lock(pai);
8083 		}
8084 	} else {
8085 		local_locked_pvh = *locked_pvh;
8086 		assert(pai == local_locked_pvh.pai);
8087 	}
8088 	assert(local_locked_pvh.pvh != 0);
8089 	pvh_assert_locked(pai);
8090 
8091 	pte_p = PT_ENTRY_NULL;
8092 	pve_p = PV_ENTRY_NULL;
8093 	if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
8094 		pte_p = pvh_ptep(local_locked_pvh.pvh);
8095 	} else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
8096 		pve_p = pvh_pve_list(local_locked_pvh.pvh);
8097 	} else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
8098 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
8099 	}
8100 
8101 	const bool is_reusable = ppattr_test_reusable(pai);
8102 
8103 	bool pvh_lock_sleep_mode_needed = false;
8104 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
8105 	sptm_disjoint_op_t *sptm_ops = NULL;
8106 
8107 	/**
8108 	 * This would also work as a block, with the above variables declared using the
8109 	 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
8110 	 * dereferencing __block variables through stack forwarding pointers) isn't needed
8111 	 * here, as we never need to use this code sequence as a closure.
8112 	 */
8113 	#define FFF_PERCPU_INIT() do { \
8114 	        disable_preemption(); \
8115 	        sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
8116 	        sptm_ops = sptm_pcpu->sptm_ops; \
8117 	} while (0)
8118 
8119 	FFF_PERCPU_INIT();
8120 
8121 	int pve_ptep_idx = 0;
8122 
8123 	/**
8124 	 * With regard to TLBI, there are three cases:
8125 	 *
8126 	 * 1. PMAP_OPTIONS_NOFLUSH is specified. In that case, neither the SPTM nor the pmap needs to flush the TLB.
8127 	 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush the
8128 	 *    TLB itself (with a ranged TLBI). In that case, we check the flush_range limits and only issue the TLBI
8129 	 *    if a mapping falls outside the range.
8130 	 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In that case, we should just
8131 	 *    let the SPTM handle TLBI flushing.
8132 	 */
8133 	const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
8134 	const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
8135 
8136 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8137 		pt_entry_t       spte;
8138 		pt_entry_t       tmplate;
8139 
8140 		if (__improbable(pvh_lock_sleep_mode_needed)) {
8141 			assert((num_mappings == 0) && (num_skipped_mappings == 0));
8142 			/**
8143 			 * Undo the explicit preemption disable done in the last call to FFF_PERCPU_INIT().
8144 			 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
8145 			 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
8146 			 * core while processing SPTM per-CPU data.  At the same time, we also want preemption
8147 			 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
8148 			 * urgent ASTs can be handled.
8149 			 */
8150 			enable_preemption();
8151 			pvh_lock_enter_sleep_mode(&local_locked_pvh);
8152 			pvh_lock_sleep_mode_needed = false;
8153 			FFF_PERCPU_INIT();
8154 		}
8155 
8156 		if (pve_p != PV_ENTRY_NULL) {
8157 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8158 			if (pte_p == PT_ENTRY_NULL) {
8159 				goto fff_skip_pve;
8160 			}
8161 		}
8162 
8163 #ifdef PVH_FLAG_IOMMU
8164 		if (pvh_ptep_is_iommu(pte_p)) {
8165 			++num_skipped_mappings;
8166 			goto fff_skip_pve;
8167 		}
8168 #endif
8169 		spte = os_atomic_load(pte_p, relaxed);
8170 		if (pte_is_compressed(spte, pte_p)) {
8171 			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
8172 		}
8173 
8174 		pt_desc_t *ptdp = NULL;
8175 		pmap_t pmap = NULL;
8176 		vm_map_address_t va = 0;
8177 
8178 		if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
8179 			/**
8180 			 * If the current mapping matches the flush range's current iteration position,
8181 			 * there's no need to do the work of getting the PTD.  We already know the pmap,
8182 			 * and the VA is implied by flush_range->pending_region_start.
8183 			 */
8184 			pmap = flush_range->ptfr_pmap;
8185 		} else {
8186 			ptdp = ptep_get_ptd(pte_p);
8187 			pmap = ptdp->pmap;
8188 			va = ptd_get_va(ptdp, pte_p);
8189 			assert(va >= pmap->min && va < pmap->max);
8190 		}
8191 
8192 		bool skip_pte = pte_is_wired(spte) &&
8193 		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);
8194 
8195 		if (skip_pte) {
8196 			result = FALSE;
8197 		}
8198 
8199 		// A concurrent pmap_remove() may have cleared the PTE
8200 		if (__improbable(!pte_is_valid(spte))) {
8201 			skip_pte = true;
8202 		}
8203 
8204 		/**
8205 		 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
8206 		 * pending disjoint ops, so we don't need to do flush range disjoint op management.
8207 		 */
8208 		if ((flush_range != NULL) && (ptdp != NULL) && !skip_pte) {
8209 			/**
8210 			 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
8211 			 * We do this in three cases:
8212 			 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
8213 			 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
8214 			 *    for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
8215 			 * 3) We need to change the options passed to the SPTM for a run of one or more mappings.  Specifically,
8216 			 *    if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
8217 			 *    belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
8218 			 *    the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
8219 			 */
8220 			uint32_t per_mapping_sptm_update_options = sptm_update_options;
8221 			if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
8222 				per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
8223 			}
8224 			if ((num_mappings == 0) ||
8225 			    (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
8226 				if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
8227 					/**
8228 					 * If we needed to submit the pending disjoint ops to make room for the new page,
8229 					 * flush any pending region ops to reenable preemption and restart the loop with
8230 					 * the lock in sleep mode.  This prevents preemption from being held disabled
8231 					 * for an arbitrary amount of time in the pathological case in which we have
8232 					 * both pending region ops and an excessively long PV list that repeatedly
8233 					 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
8234 					 */
8235 					pmap_multipage_op_submit_region(flush_range);
8236 					assert(num_mappings == 0);
8237 					num_skipped_mappings = 0;
8238 					pvh_lock_sleep_mode_needed = true;
8239 					continue;
8240 				}
8241 			}
8242 		}
8243 
8244 		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8245 
8246 		/* update pmap stats and ledgers */
8247 		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
8248 		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
8249 		if (is_altacct) {
8250 			/*
8251 			 * We do not track "reusable" status for
8252 			 * "alternate accounting" mappings.
8253 			 */
8254 		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
8255 		    is_reusable &&
8256 		    is_internal &&
8257 		    pmap != kernel_pmap) {
8258 			/* one less "reusable" */
8259 			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8260 			/* one more "internal" */
8261 			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8262 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8263 
8264 			/*
8265 			 * Since the page is being marked non-reusable, we assume that it will be
8266 			 * modified soon.  Avoid the cost of another trap to handle the fast
8267 			 * fault when we next write to this page.
8268 			 */
8269 			clear_write_fault = true;
8270 		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
8271 		    !is_reusable &&
8272 		    is_internal &&
8273 		    pmap != kernel_pmap) {
8274 			/* one more "reusable" */
8275 			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8276 			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8277 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
8278 		}
8279 
8280 		if (skip_pte) {
8281 			++num_skipped_mappings;
8282 			goto fff_skip_pve;
8283 		}
8284 
8285 		tmplate = spte;
8286 
8287 		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
8288 			/* read protection sets the pte to fault */
8289 			tmplate =  tmplate & ~ARM_PTE_AF;
8290 			ref_fault = true;
8291 		}
8292 		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
8293 			/* take away write permission if set */
8294 			if (pmap == kernel_pmap) {
8295 				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
8296 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
8297 					pte_set_was_writeable(tmplate, true);
8298 					mod_fault = true;
8299 				}
8300 			} else {
8301 				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
8302 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
8303 					pte_set_was_writeable(tmplate, true);
8304 					mod_fault = true;
8305 				}
8306 			}
8307 		}
8308 
8309 		if (ptdp != NULL) {
8310 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
8311 			sptm_ops[num_mappings].vaddr = va;
8312 			sptm_ops[num_mappings].pte_template = tmplate;
8313 			++num_mappings;
8314 		} else if (pmap_insert_flush_range_template(tmplate, flush_range)) {
8315 			/**
8316 			 * We submit both the pending disjoint and pending region ops whenever
8317 			 * either category reaches the mapping limit.  Having pending operations
8318 			 * in either category will keep preemption disabled, and we want to ensure
8319 			 * that we can at least temporarily re-enable preemption roughly every
8320 			 * SPTM_MAPPING_LIMIT mappings.
8321 			 */
8322 			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
8323 			pvh_lock_sleep_mode_needed = true;
8324 			num_mappings = num_skipped_mappings = 0;
8325 		}
8326 fff_skip_pve:
8327 		if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
8328 			if (flush_range != NULL) {
8329 				/* See comment above for why we submit both disjoint and region ops when we hit the limit. */
8330 				pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
8331 				pmap_multipage_op_submit_region(flush_range);
8332 			} else if (num_mappings > 0) {
8333 				sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
8334 			}
8335 			pvh_lock_sleep_mode_needed = true;
8336 			num_mappings = num_skipped_mappings = 0;
8337 		}
8338 		pte_p = PT_ENTRY_NULL;
8339 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8340 			pve_ptep_idx = 0;
8341 			pve_p = pve_next(pve_p);
8342 		}
8343 	}
8344 
8345 	if (num_mappings != 0) {
8346 		sptm_return_t sptm_ret;
8347 
8348 		if (flush_range == NULL) {
8349 			sptm_ret = sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
8350 		} else {
8351 			/* Resync the pending mapping state in flush_range with our local state. */
8352 			assert(num_mappings >= flush_range->pending_disjoint_entries);
8353 			flush_range->pending_disjoint_entries = num_mappings;
8354 		}
8355 	}
8356 
8357 	/**
8358 	 * Undo the explicit disable_preemption() done in FFF_PERCPU_INIT().
8359 	 * Note that enable_preemption() decrements a per-thread counter, so if
8360 	 * we happen to still hold the PVH lock in spin mode then preemption won't
8361 	 * actually be re-enabled until we drop the lock (which also decrements
8362 	 * the per-thread counter).
8363 	 */
8364 	enable_preemption();
8365 
8366 	/*
8367 	 * If we are using the same approach for ref and mod
8368 	 * faults on this PTE, do not clear the write fault;
8369 	 * this would cause both ref and mod to be set on the
8370 	 * page again, and prevent us from taking ANY read/write
8371 	 * fault on the mapping.
8372 	 */
8373 	if (clear_write_fault && !ref_aliases_mod) {
8374 		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, local_locked_pvh.pvh, PT_ENTRY_NULL, 0);
8375 	}
8376 
8377 	pp_attr_t attrs_to_clear = (result ? bits_to_clear : 0);
8378 	pp_attr_t attrs_to_set = 0;
8379 	/* update global "reusable" status for this page */
8380 	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
8381 		attrs_to_clear |= PP_ATTR_REUSABLE;
8382 	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
8383 		attrs_to_set |= PP_ATTR_REUSABLE;
8384 	}
8385 
8386 	if (mod_fault) {
8387 		attrs_to_set |= PP_ATTR_MODFAULT;
8388 	}
8389 	if (ref_fault) {
8390 		attrs_to_set |= PP_ATTR_REFFAULT;
8391 	}
8392 
8393 	if (attrs_to_set | attrs_to_clear) {
8394 		ppattr_modify_bits(pai, attrs_to_clear, attrs_to_set);
8395 	}
8396 
8397 	if (__probable(locked_pvh == NULL)) {
8398 		pvh_unlock(&local_locked_pvh);
8399 	} else {
8400 		*locked_pvh = local_locked_pvh;
8401 	}
8402 	if ((flush_range != NULL) && !preemption_enabled()) {
8403 		flush_range->processed_entries += num_skipped_mappings;
8404 	}
8405 	return result;
8406 }
8407 
8408 MARK_AS_PMAP_TEXT boolean_t
8409 arm_force_fast_fault_internal(
8410 	ppnum_t         ppnum,
8411 	vm_prot_t       allow_mode,
8412 	int             options)
8413 {
8414 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8415 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8416 	}
8417 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL, 0, NULL);
8418 }
8419 
8420 /*
8421  *	Routine:	arm_force_fast_fault
8422  *
8423  *	Function:
8424  *		Force all mappings for this page to fault according
8425  *		to the access modes allowed, so we can gather ref/modify
8426  *		bits again.
8427  */
8428 
8429 boolean_t
8430 arm_force_fast_fault(
8431 	ppnum_t         ppnum,
8432 	vm_prot_t       allow_mode,
8433 	int             options,
8434 	__unused void   *arg)
8435 {
8436 	pmap_paddr_t    phys = ptoa(ppnum);
8437 
8438 	assert(ppnum != vm_page_fictitious_addr);
8439 
8440 	if (!pa_valid(phys)) {
8441 		return FALSE;   /* Not a managed page. */
8442 	}
8443 
8444 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8445 }
8446 
8447 /**
8448  * Clear pending force fault for at most SPTM_MAPPING_LIMIT mappings for this
8449  * page based on the observed fault type, and update the appropriate ref/modify
8450  * bits for the physical page. This typically involves adding write permissions
8451  * back for write faults and setting the Access Flag for both read/write faults
8452  * (since the lack of those things is what caused the fault in the first place).
8453  *
8454  * @note Only SPTM_MAPPING_LIMIT number of mappings can be modified in a single
8455  *       arm_clear_fast_fault() call to prevent excessive PVH lock contention as
8456  *       the PVH lock should be held for `ppnum` already. If a fault is
8457  *       subsequently taken on a mapping we haven't processed, arm_fast_fault()
8458  *       will call this function with a non-NULL pte_p to perform a targeted
8459  *       fixup.
8460  *
8461  * @param ppnum Page number of the page to clear a pending force fault on.
8462  * @param fault_type The type of access/fault that triggered us wanting to clear
8463  *                   the pending force fault status. This determines how we
8464  *                   modify the PTE to not cause a fault in the future and also
8465  *                   whether we mark the PTE as referenced or modified.
8466  *                   Typically a write fault would cause the page to be marked
8467  *                   as referenced and modified, and a read fault would only
8468  *                   cause the page to be marked as referenced.
8469  * @param pvh pv_head_table entry value for [ppnum] returned by a previous call
8470  *            to pvh_lock().
8471  * @param pte_p If this value is non-PT_ENTRY_NULL then only this specified PTE
8472  *              will be modified. If it is PT_ENTRY_NULL, then every mapping to
8473  *              `ppnum` will be modified.
8474  * @param attrs_to_clear Mask of additional pp_attr_t bits to clear for the physical
8475  *                       page upon completion of this function.  This is typically
8476  *                       some combination of the REFFAULT and MODFAULT bits.
8477  *
8478  * @return TRUE if any PTEs were modified, FALSE otherwise.
8479  */
8480 MARK_AS_PMAP_TEXT static boolean_t
8481 arm_clear_fast_fault(
8482 	ppnum_t ppnum,
8483 	vm_prot_t fault_type,
8484 	uintptr_t pvh,
8485 	pt_entry_t *pte_p,
8486 	pp_attr_t attrs_to_clear)
8487 {
8488 	const pmap_paddr_t pa = ptoa(ppnum);
8489 	pv_entry_t     *pve_p;
8490 	boolean_t       result;
8491 	unsigned int    num_mappings = 0, num_skipped_mappings = 0;
8492 	pp_attr_t       attrs_to_set = 0;
8493 
8494 	assert(ppnum != vm_page_fictitious_addr);
8495 
8496 	if (!pa_valid(pa)) {
8497 		return FALSE;   /* Not a managed page. */
8498 	}
8499 
8500 	result = FALSE;
8501 	pve_p = PV_ENTRY_NULL;
8502 	if (pte_p == PT_ENTRY_NULL) {
8503 		if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
8504 			pte_p = pvh_ptep(pvh);
8505 		} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
8506 			pve_p = pvh_pve_list(pvh);
8507 		} else if (__improbable(!pvh_test_type(pvh, PVH_TYPE_NULL))) {
8508 			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)pvh, (uint64_t)pa);
8509 		}
8510 	}
8511 
8512 	disable_preemption();
8513 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8514 	sptm_disjoint_op_t *sptm_ops = sptm_pcpu->sptm_ops;
8515 
8516 	int pve_ptep_idx = 0;
8517 
8518 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
8519 		pt_entry_t spte;
8520 		pt_entry_t tmplate;
8521 
8522 		if (pve_p != PV_ENTRY_NULL) {
8523 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
8524 			if (pte_p == PT_ENTRY_NULL) {
8525 				goto cff_skip_pve;
8526 			}
8527 		}
8528 
8529 #ifdef PVH_FLAG_IOMMU
8530 		if (pvh_ptep_is_iommu(pte_p)) {
8531 			++num_skipped_mappings;
8532 			goto cff_skip_pve;
8533 		}
8534 #endif
8535 		spte = os_atomic_load(pte_p, relaxed);
8536 		// A concurrent pmap_remove() may have cleared the PTE
8537 		if (__improbable(!pte_is_valid(spte))) {
8538 			++num_skipped_mappings;
8539 			goto cff_skip_pve;
8540 		}
8541 
8542 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
8543 		const pmap_t pmap = ptdp->pmap;
8544 
8545 		tmplate = spte;
8546 
8547 		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
8548 			assert(pmap);
8549 			{
8550 				if (pmap == kernel_pmap) {
8551 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
8552 				} else {
8553 					assert(pmap->type != PMAP_TYPE_NESTED);
8554 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
8555 				}
8556 			}
8557 
8558 			tmplate |= ARM_PTE_AF;
8559 
8560 			pte_set_was_writeable(tmplate, false);
8561 			attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
8562 		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
8563 			assert(pmap);
8564 			tmplate = spte | ARM_PTE_AF;
8565 
8566 			{
8567 				attrs_to_set |= PP_ATTR_REFERENCED;
8568 			}
8569 		}
8570 
8571 		assert(spte != ARM_PTE_EMPTY);
8572 
8573 		if (spte != tmplate) {
8574 			const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
8575 			assert(va >= pmap->min && va < pmap->max);
8576 
8577 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
8578 			sptm_ops[num_mappings].vaddr = va;
8579 			sptm_ops[num_mappings].pte_template = tmplate;
8580 			++num_mappings;
8581 			result = TRUE;
8582 		}
8583 
8584 cff_skip_pve:
8585 		if ((num_mappings + num_skipped_mappings) == SPTM_MAPPING_LIMIT) {
8586 			if (num_mappings != 0) {
8587 				sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8588 				    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8589 				num_mappings = 0;
8590 			}
8591 			/*
8592 			 * We've reached the limit of mappings that can be processed in a single arm_clear_fast_fault()
8593 			 * call.  Bail out here to avoid excessive PVH lock duration on the fault path.  If a fault is
8594 			 * subsequently taken on a mapping we haven't processed, arm_fast_fault() will call this
8595 			 * function with a non-NULL pte_p to perform a targeted fixup.
8596 			 */
8597 			break;
8598 		}
8599 
8600 		pte_p = PT_ENTRY_NULL;
8601 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
8602 			pve_ptep_idx = 0;
8603 			pve_p = pve_next(pve_p);
8604 		}
8605 	}
8606 
8607 	if (num_mappings != 0) {
8608 		assert(result == TRUE);
8609 		sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
8610 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
8611 	}
8612 
8613 	if (attrs_to_set | attrs_to_clear) {
8614 		ppattr_modify_bits(pa_index(pa), attrs_to_clear, attrs_to_set);
8615 	}
8616 	enable_preemption();
8617 
8618 	return result;
8619 }
8620 
8621 /*
8622  * Determine if the fault was induced by software tracking of
8623  * modify/reference bits.  If so, re-enable the mapping (and set
8624  * the appropriate bits).
8625  *
8626  * Returns KERN_SUCCESS if the fault was induced and was
8627  * successfully handled.
8628  *
8629  * Returns KERN_FAILURE if the fault was not induced and
8630  * the function was unable to deal with it.
8631  *
8632  * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8633  * disallows this type of access.
8634  */
8635 MARK_AS_PMAP_TEXT kern_return_t
8636 arm_fast_fault_internal(
8637 	pmap_t pmap,
8638 	vm_map_address_t va,
8639 	vm_prot_t fault_type,
8640 	__unused bool was_af_fault,
8641 	__unused bool from_user)
8642 {
8643 	kern_return_t   result = KERN_FAILURE;
8644 	pt_entry_t     *ptep;
8645 	pt_entry_t      spte = ARM_PTE_EMPTY;
8646 	locked_pvh_t    locked_pvh = {.pvh = 0};
8647 	unsigned int    pai;
8648 	pmap_paddr_t    pa;
8649 	validate_pmap_mutable(pmap);
8650 
8651 	if (__probable(preemption_enabled())) {
8652 		pmap_lock(pmap, PMAP_LOCK_SHARED);
8653 	} else if (__improbable(!pmap_try_lock(pmap, PMAP_LOCK_SHARED))) {
8654 		/**
8655 		 * In certain cases, arm_fast_fault() may be invoked with preemption disabled
8656 		 * on the copyio path.  In these cases the (in-kernel) caller expects that any
8657 		 * faults taken against the user address may not be handled successfully
8658 		 * (vm_fault() allows non-preemptible callers with the possibility that the
8659 		 * fault may not be successfully handled) and will result in the copyio operation
8660 		 * returning EFAULT.  It is then the caller's responsibility to retry the copyio
8661 		 * operation in a preemptible context.
8662 		 *
8663 		 * For these cases attempting to acquire the sleepable lock will panic, so
8664 		 * we simply make a best effort and return failure just as the VM does if we
8665 		 * can't acquire the lock without sleeping.
8666 		 */
8667 		return result;
8668 	}
8669 
8670 	/*
8671 	 * If the entry doesn't exist, is completely invalid, or is already
8672 	 * valid, we can't fix it here.
8673 	 */
8674 
8675 	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
8676 	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
8677 	if (ptep != PT_ENTRY_NULL) {
8678 		while (true) {
8679 			spte = os_atomic_load(ptep, relaxed);
8680 
8681 			pa = pte_to_pa(spte);
8682 
8683 			if ((spte == ARM_PTE_EMPTY) || pte_is_compressed(spte, ptep)) {
8684 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8685 				return result;
8686 			}
8687 
8688 			if (!pa_valid(pa)) {
8689 				const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
8690 				if (frame_type == XNU_PROTECTED_IO) {
8691 					result = KERN_PROTECTION_FAILURE;
8692 				}
8693 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
8694 				return result;
8695 			}
8696 			pai = pa_index(pa);
8697 			/**
8698 			 * Check for preemption disablement and in that case use pvh_try_lock()
8699 			 * for the same reason we use pmap_try_lock() above.
8700 			 */
8701 			if (__probable(preemption_enabled())) {
8702 				locked_pvh = pvh_lock(pai);
8703 			} else {
8704 				locked_pvh = pvh_try_lock(pai);
8705 				if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
8706 					pmap_unlock(pmap, PMAP_LOCK_SHARED);
8707 					return result;
8708 				}
8709 			}
8710 			assert(locked_pvh.pvh != 0);
8711 			if (os_atomic_load(ptep, relaxed) == spte) {
8712 				/*
8713 				 * Double-check the spte value, as we care about the AF bit.
8714 				 * It's also possible that pmap_page_protect() transitioned the
8715 				 * PTE to compressed/empty before we grabbed the PVH lock.
8716 				 */
8717 				break;
8718 			}
8719 			pvh_unlock(&locked_pvh);
8720 		}
8721 	} else {
8722 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
8723 		return result;
8724 	}
8725 
8726 
8727 	if (result == KERN_SUCCESS) {
8728 		goto ff_cleanup;
8729 	}
8730 
8731 	pp_attr_t attrs = os_atomic_load(&pp_attr_table[pai], relaxed);
8732 	if ((attrs & PP_ATTR_REFFAULT) || ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT))) {
8733 		/*
8734 		 * An attempted access will always clear ref/mod fault state, as
8735 		 * appropriate for the fault type.  arm_clear_fast_fault will
8736 		 * update the associated PTEs for the page as appropriate; if
8737 		 * any PTEs are updated, we redrive the access.  If the mapping
8738 		 * does not actually allow for the attempted access, the
8739 		 * following fault will (hopefully) fail to update any PTEs, and
8740 		 * thus cause arm_fast_fault to decide that it failed to handle
8741 		 * the fault.
8742 		 */
8743 		pp_attr_t attrs_to_clear = 0;
8744 		if (attrs & PP_ATTR_REFFAULT) {
8745 			attrs_to_clear |= PP_ATTR_REFFAULT;
8746 		}
8747 		if ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT)) {
8748 			attrs_to_clear |= PP_ATTR_MODFAULT;
8749 		}
8750 
8751 		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, PT_ENTRY_NULL, attrs_to_clear)) {
8752 			/*
8753 			 * Should this preserve KERN_PROTECTION_FAILURE?  The
8754 			 * cost of not doing so is another fault in a case
8755 			 * that should already result in an exception.
8756 			 */
8757 			result = KERN_SUCCESS;
8758 		}
8759 	}
8760 
8761 	/*
8762 	 * If the PTE already has sufficient permissions, we can report the fault as handled.
8763 	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8764 	 * on mappings of the same page.
8765 	 */
8766 	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8767 		uintptr_t ap_ro, ap_rw, ap_x;
8768 		if (pmap == kernel_pmap) {
8769 			ap_ro = ARM_PTE_AP(AP_RONA);
8770 			ap_rw = ARM_PTE_AP(AP_RWNA);
8771 			ap_x = ARM_PTE_NX;
8772 		} else {
8773 			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8774 			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8775 			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8776 		}
8777 		/*
8778 		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8779 		 * hardware they may be xPRR-protected, in which case they'll be handled
8780 		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
8781 		 * handling path currently does not call arm_fast_fault() without at least
8782 		 * VM_PROT_READ in fault_type.
8783 		 */
8784 		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8785 		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8786 			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8787 				result = KERN_SUCCESS;
8788 			}
8789 		}
8790 	}
8791 
8792 	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, ptep, 0)) {
8793 		/*
8794 		 * A prior arm_clear_fast_fault() operation may have returned early due to
8795 		 * another pending PV list operation or an excessively large PV list.
8796 		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8797 		 * taking a fault on the same mapping.
8798 		 */
8799 		result = KERN_SUCCESS;
8800 	}
8801 
8802 ff_cleanup:
8803 
8804 	pvh_unlock(&locked_pvh);
8805 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
8806 	return result;
8807 }
8808 
8809 kern_return_t
8810 arm_fast_fault(
8811 	pmap_t pmap,
8812 	vm_map_address_t va,
8813 	vm_prot_t fault_type,
8814 	bool was_af_fault,
8815 	__unused bool from_user)
8816 {
8817 	kern_return_t   result = KERN_FAILURE;
8818 
8819 	if (va < pmap->min || va >= pmap->max) {
8820 		return result;
8821 	}
8822 
8823 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8824 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8825 	    from_user);
8826 
8827 
8828 	result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8829 
8830 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8831 
8832 	return result;
8833 }
8834 
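/*
 *	pmap_copy_page copies the specified (machine independent) page.
 */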
8835 void
8836 pmap_copy_page(
8837 	ppnum_t psrc,
8838 	ppnum_t pdst,
8839 	int options)
8840 {
8841 	bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8842 	    (addr64_t) (ptoa(pdst)),
8843 	    PAGE_SIZE,
8844 	    options);
8845 }
8846 
8847 
8848 /*
8849  *	pmap_copy_part_page copies part of the specified (machine independent) page.
8850  */
8851 void
8852 pmap_copy_part_page(
8853 	ppnum_t psrc,
8854 	vm_offset_t src_offset,
8855 	ppnum_t pdst,
8856 	vm_offset_t dst_offset,
8857 	vm_size_t len)
8858 {
8859 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8860 	    (addr64_t) (ptoa(pdst) + dst_offset),
8861 	    len);
8862 }
8863 
8864 
8865 /*
8866  *	pmap_zero_page zeros the specified (machine independent) page.
8867  */
8868 void
8869 pmap_zero_page(
8870 	ppnum_t pn)
8871 {
8872 	assert(pn != vm_page_fictitious_addr);
8873 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8874 }
8875 
8876 /*
8877  *	pmap_zero_page_with_options allows the caller to specify further operations
8878  *	to perform with the zeroing.
8879  */
8880 void
8881 pmap_zero_page_with_options(
8882 	ppnum_t pn,
8883 	int options)
8884 {
8885 	assert(pn != vm_page_fictitious_addr);
8886 	bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8887 }
8888 
8889 /*
8890  *	pmap_zero_part_page
8891  *	zeros the specified (machine independent) part of a page.
8892  */
8893 void
8894 pmap_zero_part_page(
8895 	ppnum_t pn,
8896 	vm_offset_t offset,
8897 	vm_size_t len)
8898 {
8899 	assert(pn != vm_page_fictitious_addr);
8900 	assert(offset + len <= PAGE_SIZE);
8901 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8902 }
8903 
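/*
 *	Establish the fixed LOWGLOBAL_ALIAS mapping of the lowGlo structure:
 *	kernel read-only, non-executable, writeback-cacheable.
 */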
8904 void
8905 pmap_map_globals(
8906 	void)
8907 {
8908 	pt_entry_t      pte;
8909 
8910 	pte = pa_to_pte(kvtophys_nofail((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX |
8911 	    ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
8912 #if __ARM_KERNEL_PROTECT__
8913 	pte |= ARM_PTE_NG;
8914 #endif /* __ARM_KERNEL_PROTECT__ */
8915 	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8916 	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8917 	sptm_map_page(kernel_pmap->ttep, LOWGLOBAL_ALIAS, pte);
8918 
8919 
8920 #if KASAN
8921 	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8922 #endif
8923 }
8924 
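/*
 *	Return the kernel VA of per-CPU copy window [index] for CPU [cpu_num].
 */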
8925 vm_offset_t
8926 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8927 {
8928 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8929 		panic("%s: invalid index %u", __func__, index);
8930 	}
8931 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8932 }
8933 
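/*
 *	Find a free per-CPU copy window on the current CPU, map the given
 *	physical page into it with the requested protection and cacheability,
 *	and return the window index.  Must be called with preemption disabled.
 */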
8934 MARK_AS_PMAP_TEXT unsigned int
8935 pmap_map_cpu_windows_copy_internal(
8936 	ppnum_t pn,
8937 	vm_prot_t prot,
8938 	unsigned int wimg_bits)
8939 {
8940 	pt_entry_t      *ptep = NULL, pte;
8941 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8942 	unsigned int    cpu_num;
8943 	unsigned int    cpu_window_index;
8944 	vm_offset_t     cpu_copywindow_vaddr = 0;
8945 	bool            need_strong_sync = false;
8946 
8947 	assert(get_preemption_level() > 0);
8948 	cpu_num = pmap_cpu_data->cpu_number;
8949 
8950 	for (cpu_window_index = 0; cpu_window_index < CPUWINDOWS_MAX; cpu_window_index++) {
8951 		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, cpu_window_index);
8952 		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8953 		assert(!pte_is_compressed(*ptep, ptep));
8954 		if (!pte_is_valid(*ptep)) {
8955 			break;
8956 		}
8957 	}
8958 	if (__improbable(cpu_window_index == CPUWINDOWS_MAX)) {
8959 		panic("%s: out of windows", __func__);
8960 	}
8961 
8962 	const pmap_paddr_t paddr = ptoa(pn);
8963 	pte = pa_to_pte(paddr) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8964 #if __ARM_KERNEL_PROTECT__
8965 	pte |= ARM_PTE_NG;
8966 #endif /* __ARM_KERNEL_PROTECT__ */
8967 	pte |= wimg_to_pte(wimg_bits, paddr);
8968 
8969 	if (prot & VM_PROT_WRITE) {
8970 		pte |= ARM_PTE_AP(AP_RWNA);
8971 	} else {
8972 		pte |= ARM_PTE_AP(AP_RONA);
8973 	}
8974 
8975 	/*
8976 	 * It's expected to be safe for an interrupt handler to nest copy-window usage with the
8977 	 * active thread on a CPU, as long as a sufficient number of copy windows are available.
8978 	 * --If the interrupt handler executes before the active thread creates the per-CPU mapping,
8979 	 *   or after the active thread completely removes the mapping, it may use the same mapping
8980 	 *   but will finish execution and tear down the mapping without the thread needing to know.
8981 	 * --If the interrupt handler executes after the active thread creates the per-CPU mapping,
8982 	 *   it will observe the valid mapping and use a different copy window.
8983 	 * --If the interrupt handler executes after the active thread clears the PTE in
8984 	 *   pmap_unmap_cpu_windows_copy() but before the active thread flushes the TLB, the code
8985 	 *   for computing cpu_window_index above will observe the PTE_INVALID_IN_FLIGHT token set
8986 	 *   by the SPTM, and will select a different index.
8987 	 */
8988 	const sptm_return_t sptm_status = sptm_map_page(kernel_pmap->ttep, cpu_copywindow_vaddr, pte);
8989 	if (__improbable(sptm_status != SPTM_SUCCESS)) {
8990 		panic("%s: failed to map CPU copy-window VA 0x%llx with SPTM status %d",
8991 		    __func__, (unsigned long long)cpu_copywindow_vaddr, sptm_status);
8992 	}
8993 
8994 
8995 	/*
8996 	 * Clean up any pending strong TLB flush for the same window in a thread we may have
8997 	 * interrupted.
8998 	 */
8999 	if (__improbable(pmap_cpu_data->copywindow_strong_sync[cpu_window_index])) {
9000 		arm64_sync_tlb(true);
9001 	}
9002 	pmap_cpu_data->copywindow_strong_sync[cpu_window_index] = need_strong_sync;
9003 
9004 	return cpu_window_index;
9005 }
9006 
9007 unsigned int
9008 pmap_map_cpu_windows_copy(
9009 	ppnum_t pn,
9010 	vm_prot_t prot,
9011 	unsigned int wimg_bits)
9012 {
9013 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
9014 }
9015 
9016 MARK_AS_PMAP_TEXT void
9017 pmap_unmap_cpu_windows_copy_internal(
9018 	unsigned int index)
9019 {
9020 	unsigned int    cpu_num;
9021 	vm_offset_t     cpu_copywindow_vaddr = 0;
9022 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
9023 
9024 	assert(index < CPUWINDOWS_MAX);
9025 	assert(get_preemption_level() > 0);
9026 
9027 	cpu_num = pmap_cpu_data->cpu_number;
9028 
9029 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
9030 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
9031 	 * (which are likely to have been on I/O memory) are complete before
9032 	 * tearing down the mapping. */
9033 	__builtin_arm_dsb(DSB_SY);
9034 	sptm_unmap_region(kernel_pmap->ttep, cpu_copywindow_vaddr, 1, 0);
9035 	if (__improbable(pmap_cpu_data->copywindow_strong_sync[index])) {
9036 		arm64_sync_tlb(true);
9037 		pmap_cpu_data->copywindow_strong_sync[index] = false;
9038 	}
9039 }
9040 
9041 void
9042 pmap_unmap_cpu_windows_copy(
9043 	unsigned int index)
9044 {
9045 	return pmap_unmap_cpu_windows_copy_internal(index);
9046 }
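
/*
 * Illustrative usage sketch (not part of the original source): the copy
 * windows are per-CPU, so a caller must keep preemption disabled across the
 * map/use/unmap sequence, matching the get_preemption_level() asserts above.
 * A hypothetical read-write mapping of page 'pn' might look like:
 *
 *	disable_preemption();
 *	unsigned int wimg = pmap_cache_attributes(pn) & VM_WIMG_MASK;
 *	unsigned int idx = pmap_map_cpu_windows_copy(pn, VM_PROT_READ | VM_PROT_WRITE, wimg);
 *	vm_offset_t va = pmap_cpu_windows_copy_addr(pmap_get_cpu_data()->cpu_number, idx);
 *	... access the page through va ...
 *	pmap_unmap_cpu_windows_copy(idx);
 *	enable_preemption();
 */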
9047 
9048 /*
9049  * Indicate that a pmap is intended to be used as a nested pmap
9050  * within one or more larger address spaces.  This must be set
9051  * before pmap_nest() is called with this pmap as the 'subordinate'.
9052  */
9053 MARK_AS_PMAP_TEXT void
9054 pmap_set_nested_internal(
9055 	pmap_t pmap)
9056 {
9057 	validate_pmap_mutable(pmap);
9058 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9059 	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
9060 		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
9061 		    __func__, pmap, pmap->type);
9062 	}
9063 	pmap->type = PMAP_TYPE_NESTED;
9064 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
9065 	retype_params.attr_idx = (pt_attr_page_size(pt_attr) == 4096) ? SPTM_PT_GEOMETRY_4K : SPTM_PT_GEOMETRY_16K;
9066 	pmap_txm_acquire_exclusive_lock(pmap);
9067 	sptm_retype(pmap->ttep, XNU_USER_ROOT_TABLE, XNU_SHARED_ROOT_TABLE, retype_params);
9068 	pmap_txm_release_exclusive_lock(pmap);
9069 	pmap_get_pt_ops(pmap)->free_id(pmap);
9070 }
9071 
9072 void
9073 pmap_set_nested(
9074 	pmap_t pmap)
9075 {
9076 	pmap_set_nested_internal(pmap);
9077 }
9078 
9079 bool
9080 pmap_is_nested(
9081 	pmap_t pmap)
9082 {
9083 	return pmap->type == PMAP_TYPE_NESTED;
9084 }
9085 
9086 /*
9087  * pmap_trim_range(pmap, start, end)
9088  *
9089  * pmap  = pmap to operate on
9090  * start = start of the range
9091  * end   = end of the range
9092  *
9093  * Attempts to deallocate TTEs for the given range within the nested region.
9094  */
9095 MARK_AS_PMAP_TEXT static void
9096 pmap_trim_range(
9097 	pmap_t pmap,
9098 	addr64_t start,
9099 	addr64_t end)
9100 {
9101 	addr64_t cur;
9102 	addr64_t nested_region_start;
9103 	addr64_t nested_region_end;
9104 	addr64_t adjusted_start;
9105 	addr64_t adjusted_end;
9106 	addr64_t adjust_offmask;
9107 	tt_entry_t * tte_p;
9108 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
9109 
9110 	if (__improbable(end < start)) {
9111 		panic("%s: invalid address range, "
9112 		    "pmap=%p, start=%p, end=%p",
9113 		    __func__,
9114 		    pmap, (void*)start, (void*)end);
9115 	}
9116 
9117 	nested_region_start = pmap->nested_region_addr;
9118 	nested_region_end = nested_region_start + pmap->nested_region_size;
9119 
9120 	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
9121 		panic("%s: range outside nested region %p-%p, "
9122 		    "pmap=%p, start=%p, end=%p",
9123 		    __func__, (void *)nested_region_start, (void *)nested_region_end,
9124 		    pmap, (void*)start, (void*)end);
9125 	}
9126 
9127 	/* Contract the range to TT page boundaries. */
9128 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
9129 
9130 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
9131 	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
9132 	adjusted_end = end & ~adjust_offmask;
9133 
9134 	/* Iterate over the range, trying to remove TTEs. */
9135 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) {
9136 		tte_p = pmap_tte(pmap, cur);
9137 
9138 		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
9139 			if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) {
9140 				/* Deallocate for the nested map. */
9141 				pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr), false);
9142 			} else if (pmap->type == PMAP_TYPE_USER) {
9143 				/**
9144 				 * Just remove for the parent map. If the leaf table pointed
9145 				 * to by the TTE being removed (owned by the nested pmap)
9146 				 * has any mappings, then this call will panic. This
9147 				 * enforces the policy that tables being trimmed must be
9148 				 * empty to prevent possible use-after-free attacks.
9149 				 */
9150 				pmap_tte_trim(pmap, cur, tte_p);
9151 			} else {
9152 				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
9153 			}
9154 		}
9155 	}
9156 }
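
/*
 * Worked example of the boundary contraction above (illustrative values only,
 * assuming a 16KB-page geometry where one leaf table spans 32MB and
 * page_ratio == 1, so adjust_offmask == 0x1FFFFFF):
 *
 *	start = 0x180400000  ->  adjusted_start = 0x182000000  (rounded up)
 *	end   = 0x1BFF00000  ->  adjusted_end   = 0x1BE000000  (rounded down)
 *
 * Only whole leaf tables strictly inside [adjusted_start, adjusted_end) are
 * candidates for removal; partially-covered tables at either edge are left
 * in place.
 */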
9157 
9158 /*
9159  * pmap_trim_internal(grand, subord, vstart, size)
9160  *
9161  * grand  = pmap subord is nested in
9162  * subord = nested pmap
9163  * vstart = start of the used range in grand
9164  * size   = size of the used range
9165  *
9166  * Attempts to trim the shared region page tables down to only cover the given
9167  * range in subord and grand.
9168  *
9169  * This function assumes that trimming of [subord] happens exactly once, against
9170  * a temporary [grand] pmap, and that it happens before [subord] is ever actually
9171  * nested in a real task pmap.  Unlike its PPL predecessor (which can't trust its
9172  * callers), the SPTM implementation therefore does not do any refcounting to
9173  * track top-level pmaps that may have nested tables outside the trimmed range.
9174  */
9175 MARK_AS_PMAP_TEXT void
9176 pmap_trim_internal(
9177 	pmap_t grand,
9178 	pmap_t subord,
9179 	addr64_t vstart,
9180 	uint64_t size)
9181 {
9182 	addr64_t vend;
9183 	addr64_t adjust_offmask;
9184 
9185 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9186 		panic("%s: grand addr wraps around, "
9187 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9188 		    __func__, grand, subord, (void*)vstart, size);
9189 	}
9190 
9191 	validate_pmap_mutable(grand);
9192 	validate_pmap(subord);
9193 
9194 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9195 
9196 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9197 		panic("%s: subord is of non-nestable type 0x%hhx, "
9198 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9199 		    __func__, subord->type, grand, subord, (void*)vstart, size);
9200 	}
9201 
9202 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9203 		panic("%s: grand is of unsupported type 0x%hhx for nesting, "
9204 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9205 		    __func__, grand->type, grand, subord, (void*)vstart, size);
9206 	}
9207 
9208 	if (__improbable(grand->nested_pmap != subord)) {
9209 		panic("%s: grand->nested != subord, "
9210 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9211 		    __func__, grand, subord, (void*)vstart, size);
9212 	}
9213 
9214 	if (__improbable((vstart < grand->nested_region_addr) ||
9215 	    (vend > (grand->nested_region_addr + grand->nested_region_size)))) {
9216 		panic("%s: grand range not in nested region, "
9217 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9218 		    __func__, grand, subord, (void*)vstart, size);
9219 	}
9220 
9221 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
9222 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
9223 	vm_map_offset_t true_end = vend;
9224 
9225 	os_atomic_store(&subord->nested_region_true_start, vstart & ~adjust_offmask, relaxed);
9226 
9227 	if (__improbable(os_add_overflow(true_end, adjust_offmask, &true_end))) {
9228 		panic("%s: padded true end wraps around, "
9229 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
9230 		    __func__, grand, subord, (void*)vstart, size);
9231 	}
9232 
9233 	os_atomic_store(&subord->nested_region_true_end, true_end & ~adjust_offmask, relaxed);
9234 
9235 	os_atomic_store(&grand->nested_region_true_start, subord->nested_region_true_start, relaxed);
9236 	os_atomic_store(&grand->nested_region_true_end, subord->nested_region_true_end, relaxed);
9237 	/* Trim grand to only cover the given range. */
9238 	pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9239 	pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9240 	pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9241 	pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9242 }
9243 
9244 void
9245 pmap_trim(
9246 	pmap_t grand,
9247 	pmap_t subord,
9248 	addr64_t vstart,
9249 	uint64_t size)
9250 {
9251 	pmap_trim_internal(grand, subord, vstart, size);
9252 }
9253 
9254 #if HAS_APPLE_PAC
9255 
9256 void *
9257 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9258 {
9259 	void *res = NULL;
9260 	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
9261 
9262 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9263 	__compiler_materialize_and_prevent_reordering_on(value);
9264 	res = sptm_sign_user_pointer(value, key, discriminator, jop_key);
9265 	__compiler_materialize_and_prevent_reordering_on(res);
9266 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9267 
9268 	ml_set_interrupts_enabled(current_intr_state);
9269 
9270 	return res;
9271 }
9272 
9273 typedef struct {
9274 	void *locations[SPTM_BATCHED_OPS_LIMIT];
9275 	unsigned int index;
9276 	uint64_t jop_key;
9277 } pmap_batch_sign_user_ptr_state_t;
9278 
9279 static pmap_batch_sign_user_ptr_state_t PERCPU_DATA(percpu_pmap_batch_sign_user_ptr_state);
9280 
9281 /**
9282  * Accumulates user pointer signing requests, and calls into the SPTM to sign
9283  * them as it sees fit or when told to do so. If an SPTM call is made,
9284  * this function copies the signed pointers to their respective locations.
9285  *
9286  * @note This function will disable preemption when called for the first
9287  *       time or for the first time after a submission to SPTM. It enables
9288  *       preemption after a submission is made.
9289  *
9290  * @note The caller can force the submission of accumulated ops so far by
9291  *       passing a NULL location pointer.
9292  *
9293  * @note The jop_key argument is expected to be consistent throughout a
9294  *       batch. This function asserts that the jop_key passed in is
9295  *       consistent with the other ops in the batch.
9296  *
9297  * @param location The destination where the signed pointer will be copied
9298  *                 to. The caller can pass a NULL pointer to force an SPTM
9299  *                 submission of the accumulated signing ops so far. In
9300  *                 such case, the rest of the argument list is ignored.
9301  *                 that case, the rest of the argument list is ignored.
9302  * @param key The key used to sign the pointer.
9303  * @param discriminator The discriminator used to sign the pointer.
9304  * @param jop_key The JOP key used to sign the pointer.
9305  *
9306  * @return true if an SPTM call was made. Otherwise false.
9307  */
9308 bool
9309 pmap_batch_sign_user_ptr(void *location, void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9310 {
9311 	bool submitted_to_sptm = false;
9312 
9313 	/* Disable preemption to access percpu data. */
9314 	disable_preemption();
9315 
9316 	pmap_batch_sign_user_ptr_state_t *state = PERCPU_GET(percpu_pmap_batch_sign_user_ptr_state);
9317 	void **locations = state->locations;
9318 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9319 	sptm_user_pointer_op_t *sptm_user_pointer_ops = (sptm_user_pointer_op_t *) sptm_pcpu->sptm_user_pointer_ops;
9320 	uintptr_t *sptm_values = (uintptr_t *) sptm_pcpu->sptm_prev_ptes;
9321 
9322 	if (state->index != 0) {
9323 		/* Avoid leaking preemption counts by offsetting the disable at the beginning of this function. */
9324 		enable_preemption();
9325 
9326 		/* Disabled preemption is still expected. */
9327 		assert(!preemption_enabled());
9328 	}
9329 
9330 	assert(state->index < SPTM_BATCHED_OPS_LIMIT);
9331 
9332 	/* Stash a pointer signing op if a copy location is supplied. */
9333 	if (location != NULL) {
9334 		locations[state->index] = location;
9335 		sptm_user_pointer_ops[state->index].value = (uintptr_t)value;
9336 		sptm_user_pointer_ops[state->index].key = key;
9337 		sptm_user_pointer_ops[state->index].discriminator = discriminator;
9338 
9339 		if (state->index == 0) {
9340 			state->jop_key = jop_key;
9341 		} else {
9342 			assert(state->jop_key == jop_key);
9343 		}
9344 
9345 		state->index = state->index + 1;
9346 	}
9347 
9348 	/**
9349 	 * Submit the stashed ops on this cpu to SPTM when:
9350 	 *   1. there are SPTM_BATCHED_OPS_LIMIT ops accumulated on the cpu, or
9351 	 *   2. the caller asks us to submit whatever we have accumulated by
9352 	 *      passing in a NULL location argument.
9353 	 */
9354 	if (state->index == SPTM_BATCHED_OPS_LIMIT || location == NULL) {
9355 		if (__probable(state->index > 0)) {
9356 			const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
9357 
9358 			uint64_t saved_jop_state = ml_enable_user_jop_key(state->jop_key);
9359 			sptm_batch_sign_user_pointer(sptm_pcpu->sptm_user_pointer_ops_pa, state->index, state->jop_key);
9360 			ml_disable_user_jop_key(state->jop_key, saved_jop_state);
9361 
9362 			ml_set_interrupts_enabled(current_intr_state);
9363 
9364 			for (unsigned int i = 0; i < state->index; i++) {
9365 				memcpy(locations[i], &(sptm_values[i]), sizeof(sptm_values[i]));
9366 			}
9367 
9368 			state->index = 0;
9369 			state->jop_key = 0;
9370 			submitted_to_sptm = true;
9371 		}
9372 	}
9373 
9374 	/**
9375 	 * There is a subtle difference between checking submitted_to_sptm and
9376 	 * state->index here. We need to handle the case where no ops have been
9377 	 * accumulated but a NULL location is passed in: submitted_to_sptm will be
9378 	 * false, and relying on it alone would leak a preemption count.
9379 	 */
9380 	if (state->index == 0) {
9381 		assert(submitted_to_sptm || (location == NULL));
9382 		enable_preemption();
9383 	}
9384 
9385 	return submitted_to_sptm;
9386 }
9387 
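/*
 * Illustrative usage sketch (not part of the original source): a caller
 * signing an array of user pointers under a single JOP key might accumulate
 * ops and then force a final flush with a NULL location, e.g.:
 *
 *	for (i = 0; i < n; i++) {
 *		pmap_batch_sign_user_ptr(&dst[i], src[i], key, disc[i], jop_key);
 *	}
 *	pmap_batch_sign_user_ptr(NULL, NULL, key, 0, 0);  // flush any remainder
 *
 * The names dst/src/disc are hypothetical; the final call relies on the
 * documented behavior that all arguments other than 'location' are ignored
 * when location is NULL.
 */
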
9388 void *
9389 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9390 {
9391 	void *res = NULL;
9392 	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
9393 
9394 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9395 	__compiler_materialize_and_prevent_reordering_on(value);
9396 	res = sptm_auth_user_pointer(value, key, discriminator, jop_key);
9397 	__compiler_materialize_and_prevent_reordering_on(res);
9398 	ml_disable_user_jop_key(jop_key, saved_jop_state);
9399 
9400 	if (res == SPTM_AUTH_FAILURE) {
9401 		res = ml_poison_ptr(value, key);
9402 	}
9403 
9404 	ml_set_interrupts_enabled(current_intr_state);
9405 
9406 	return res;
9407 }
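
/*
 * Illustrative round trip (not part of the original source): a pointer signed
 * on behalf of a user process can later be re-validated with the same
 * key/discriminator/JOP-key triple, e.g.:
 *
 *	void *signed_p = pmap_sign_user_ptr(raw_p, ptrauth_key_asia, disc, jop_key);
 *	...
 *	void *checked_p = pmap_auth_user_ptr(signed_p, ptrauth_key_asia, disc, jop_key);
 *
 * On authentication failure pmap_auth_user_ptr() returns a poisoned pointer
 * (via ml_poison_ptr) rather than faulting, so callers must not assume the
 * result is dereferenceable.
 */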
9408 #endif /* HAS_APPLE_PAC */
9409 
9410 /**
9411  * Establishes the pmap associated with a shared region as the nested pmap
9412  * for a top-level user pmap.
9413  *
9414  * @param grand The top-level user pmap
9415  * @param subord The pmap to be set as [grand]'s nested pmap
9416  * @param vstart The base VA of the region to be nested.
9417  * @param size The size (in bytes) of the region to be nested.
9418  */
9419 void
9420 pmap_set_shared_region(
9421 	pmap_t grand,
9422 	pmap_t subord,
9423 	addr64_t vstart,
9424 	uint64_t size)
9425 {
9426 	addr64_t vend;
9427 
9428 	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_START,
9429 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord), vstart, size);
9430 
9431 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9432 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9433 	}
9434 
9435 	validate_pmap_mutable(grand);
9436 	validate_pmap(subord);
9437 	os_ref_retain_raw(&subord->ref_count, &pmap_refgrp);
9438 
9439 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9440 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9441 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9442 	}
9443 
9444 	if (__improbable(((size | vstart) &
9445 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9446 		panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
9447 		    __func__, grand, vstart, size);
9448 	}
9449 
9450 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9451 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9452 	}
9453 
9454 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
9455 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9456 	}
9457 
9458 	if (subord->nested_region_size == 0) {
9459 		/**
9460 		 * Since subord->nested_region_size is 0, this is the first time subord is being
9461 		 * associated with a top-level pmap.  We therefore need to take a few extra steps to
9462 		 * ensure the shared region is properly configured.  This initial setup step is expected
9463 		 * to be issued by the VM layer against a temporary grand pmap before any other pmap
9464 		 * is allowed to associate with subord, so synchronization is not needed here to prevent
9465 		 * concurrent initialization.
9466 		 */
9467 		sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift);
9468 
9469 		/**
9470 		 * Since this is the first time subord is being associated with a top-level pmap, ensure
9471 		 * its nested region is fully expanded to L3 so that all relevant L3 tables can later be
9472 		 * inserted into top-level pmaps via pmap_nest().  Note that pmap_remove() will never
9473 		 * dynamically free L3 tables from nested pmaps.  However, some of these tables may be
9474 		 * freed by a later call to pmap_trim().
9475 		 */
9476 		vm_map_offset_t vaddr = vstart;
9477 		while (vaddr < vend) {
9478 			const tt_entry_t *const stte_p = pmap_tte(subord, vaddr);
9479 			if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9480 				__assert_only kern_return_t kr;
9481 				kr = pmap_expand(subord, vaddr, 0, pt_attr_leaf_level(pt_attr));
9482 				assert3u(kr, ==, KERN_SUCCESS);
9483 			}
9484 			vaddr += pt_attr_twig_size(pt_attr);
9485 		}
9486 
9487 		const uint64_t nested_region_unnested_table_bits = (size >> (pt_attr_twig_shift(pt_attr) - 1));
9488 		if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) {
9489 			panic("%s: bitmap allocation size %llu will truncate, "
9490 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9491 			    __func__, nested_region_unnested_table_bits,
9492 			    grand, subord, vstart, size);
9493 		}
9494 
9495 		subord->nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits);
9496 		subord->nested_region_addr = vstart;
9497 		subord->nested_region_size = (mach_vm_offset_t)size;
9498 	}
9499 
9500 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9501 		grand->nested_region_addr = vstart;
9502 		grand->nested_region_size = (mach_vm_offset_t)size;
9503 		assert3u(grand->nested_region_addr, ==, subord->nested_region_addr);
9504 		assert3u(grand->nested_region_size, ==, subord->nested_region_size);
9505 		pmap_txm_acquire_exclusive_lock(grand);
9506 		pmap_txm_acquire_shared_lock(subord);
9507 		sptm_set_shared_region(grand->ttep, subord->ttep);
9508 		pmap_txm_release_shared_lock(subord);
9509 		pmap_txm_release_exclusive_lock(grand);
9510 	} else {
9511 		panic("%s: pmap %p already has a nested pmap %p", __func__, grand, grand->nested_pmap);
9512 	}
9513 
9514 	PMAP_TRACE(2, PMAP_CODE(PMAP__SET_SHARED_REGION) | DBG_FUNC_END);
9515 }
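
/*
 * Rough ordering sketch (as implied by the comments in this file; the actual
 * sequencing is driven by the VM shared-region layer, and the names
 * temp_grand/task_pmap/base/size below are placeholders for illustration):
 *
 *	pmap_set_nested(subord);                                  // mark as nestable
 *	pmap_set_shared_region(temp_grand, subord, base, size);   // first association
 *	pmap_trim(temp_grand, subord, used_base, used_size);      // at most once
 *	// then, for each task pmap that maps the shared region:
 *	pmap_set_shared_region(task_pmap, subord, base, size);
 *	pmap_nest(task_pmap, subord, base, size);
 */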
9516 
9517 /**
9518  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9519  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9520  * This function operates in 2 main phases:
9521  * 1. Expands grand to ensure the required twig-level page table pages for
9522  *    the mapping range are present in grand.
9523  * 2. Invokes sptm_nest_region() to copy the relevant TTEs from subord to grand.
9524  *
9525  * @note This function requires that pmap_set_shared_region() has already been
9526  *       called for the [grand, subord] pair.
9527  *
9528  * @note The VA region defined by vstart and vsize must lie entirely within the
9529  *       VA region established by the previous call to pmap_set_shared_region().
9530  *
9531  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
9532  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
9533  * @param vstart twig-aligned virtual address for the beginning of the nesting range
9534  * @param size twig-aligned size of the nesting range
9535  *
9536  * @return KERN_RESOURCE_SHORTAGE on allocation failure, KERN_SUCCESS otherwise
9537  */
9538 MARK_AS_PMAP_TEXT kern_return_t
9539 pmap_nest_internal(
9540 	pmap_t grand,
9541 	pmap_t subord,
9542 	addr64_t vstart,
9543 	uint64_t size)
9544 {
9545 	kern_return_t kr = KERN_SUCCESS;
9546 	vm_map_offset_t vaddr;
9547 	tt_entry_t     *gtte_p;
9548 
9549 	addr64_t vend;
9550 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
9551 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9552 	}
9553 
9554 	validate_pmap_mutable(grand);
9555 	validate_pmap(subord);
9556 
9557 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9558 
9559 	if (__improbable(((size | vstart) &
9560 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9561 		panic("%s: pmap %p unaligned nesting request 0x%llx, 0x%llx",
9562 		    __func__, grand, vstart, size);
9563 	}
9564 
9565 	if (__improbable(subord != grand->nested_pmap)) {
9566 		panic("%s: attempt to nest pmap %p into pmap %p which has a different nested pmap %p",
9567 		    __func__, subord, grand, grand->nested_pmap);
9568 	}
9569 
9570 	addr64_t true_start = vstart;
9571 	if (true_start < subord->nested_region_true_start) {
9572 		true_start = subord->nested_region_true_start;
9573 	}
9574 
9575 	addr64_t true_end = vend;
9576 	if (true_end > subord->nested_region_true_end) {
9577 		true_end = subord->nested_region_true_end;
9578 	}
9579 
9580 	/* Ensure grand is expanded to L2 so that sptm_nest_region() can copy L3 entries from subord. */
9581 	vaddr = (vm_map_offset_t) true_start;
9582 
9583 	while (vaddr < true_end) {
9584 		gtte_p = pmap_tte(grand, vaddr);
9585 		if (gtte_p == PT_ENTRY_NULL) {
9586 			kr = pmap_expand(grand, vaddr, 0, pt_attr_twig_level(pt_attr));
9587 
9588 			if (kr != KERN_SUCCESS) {
9589 				goto done;
9590 			}
9591 		}
9592 
9593 		vaddr += pt_attr_twig_size(pt_attr);
9594 	}
9595 
9596 	vaddr = (vm_map_offset_t) true_start;
9597 
9598 	while (vaddr < true_end) {
9599 		/*
9600 		 * The SPTM requires the run of TTE updates to all reside within the same L2 page, so the region
9601 		 * we supply to the SPTM can't span multiple L1 TTEs.
9602 		 */
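		/*
		 * For scale (illustrative, geometry-dependent): with a 16KB granule an
		 * L1 entry covers 64GB of VA, so in practice this split rarely produces
		 * more than one sptm_nest_region() call for a shared-region-sized range.
		 */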
9603 		vm_map_offset_t vlim = ((vaddr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9604 		if (vlim > true_end) {
9605 			vlim = true_end;
9606 		}
9607 		sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift);
9608 		vaddr = vlim;
9609 	}
9610 
9611 done:
9612 	return kr;
9613 }
9614 
9615 kern_return_t
9616 pmap_nest(
9617 	pmap_t grand,
9618 	pmap_t subord,
9619 	addr64_t vstart,
9620 	uint64_t size)
9621 {
9622 	kern_return_t kr = KERN_SUCCESS;
9623 
9624 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9625 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9626 	    VM_KERNEL_ADDRHIDE(vstart));
9627 
9628 	pmap_verify_preemptible();
9629 	kr = pmap_nest_internal(grand, subord, vstart, size);
9630 
9631 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9632 
9633 	return kr;
9634 }
9635 
9636 /*
9637  *	kern_return_t pmap_unnest(grand, vaddr)
9638  *
9639  *	grand  = the pmap that will have the virtual range unnested
9640  *	vaddr  = start of range in pmap to be unnested
9641  *	size   = size of range in pmap to be unnested
9642  *
9643  */
9644 
9645 kern_return_t
9646 pmap_unnest(
9647 	pmap_t grand,
9648 	addr64_t vaddr,
9649 	uint64_t size)
9650 {
9651 	return pmap_unnest_options(grand, vaddr, size, 0);
9652 }
9653 
9654 /**
9655  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9656  * from a top-level pmap ('grand').  The corresponding mappings in the nested
9657  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9658  * still have the region nested.  The mappings in 'grand' will be left empty
9659  * with the assumption that they will be demand-filled by subsequent access faults.
9660  *
9661  * This function operates in 2 main phases:
9662  * 1. Iteration over the nested pmap's mappings for the specified range to mark
9663  *    them non-global.
9664  * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand.
9665  *
9666  * @param grand pmap from which to unnest mappings
9667  * @param vaddr twig-aligned virtual address for the beginning of the nested range
9668  * @param size twig-aligned size of the nested range
9669  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9670  *        grand is being torn down and step 1) above is not needed.
9671  */
9672 MARK_AS_PMAP_TEXT void
9673 pmap_unnest_options_internal(
9674 	pmap_t grand,
9675 	addr64_t vaddr,
9676 	uint64_t size,
9677 	unsigned int option)
9678 {
9679 	vm_map_offset_t start;
9680 	vm_map_offset_t addr;
9681 	unsigned int    current_index;
9682 	unsigned int    start_index;
9683 	unsigned int    max_index;
9684 
9685 	addr64_t vend;
9686 	addr64_t true_end;
9687 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9688 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9689 	}
9690 
9691 	validate_pmap_mutable(grand);
9692 
9693 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9694 
9695 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9696 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9697 		    (unsigned long long)vaddr, (unsigned long long)size);
9698 	}
9699 
9700 	struct pmap * const subord = grand->nested_pmap;
9701 	if (__improbable(subord == NULL)) {
9702 		panic("%s: %p has no nested pmap", __func__, grand);
9703 	}
9704 
9705 	true_end = vend;
9706 	if (true_end > subord->nested_region_true_end) {
9707 		true_end = subord->nested_region_true_end;
9708 	}
9709 
9710 	if ((option & PMAP_UNNEST_CLEAN) == 0) {
9711 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9712 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9713 		}
9714 
9715 		start = vaddr;
9716 		if (start < subord->nested_region_true_start) {
9717 			start = subord->nested_region_true_start;
9718 		}
9719 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9720 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9721 
9722 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9723 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9724 
9725 			bool unnested = bitmap_test(subord->nested_region_unnested_table_bitmap, UNNEST_BIT(current_index));
9726 			os_atomic_thread_fence(acquire);
9727 			if (!unnested) {
9728 				atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9729 				    UNNEST_IN_PROGRESS_BIT(current_index), memory_order_relaxed);
9730 				/*
9731 				 * Issue a store-load barrier to ensure the UNNEST_IN_PROGRESS bit is visible to any pmap_enter()
9732 				 * operation that enters the epoch after this point.
9733 				 */
9734 				os_atomic_thread_fence(seq_cst);
9735 				pmap_epoch_prepare_drain();
9736 				pmap_epoch_drain();
9737 
9738 				unsigned int num_mappings = 0;
9739 				disable_preemption();
9740 				pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9741 				/*
9742 				 * We've marked the 'twig' region as being unnested.  Every mapping entered within
9743 				 * the nested pmap in this region will now be marked non-global.
9744 				 */
9745 				while (addr < vlim) {
9746 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9747 
9748 					sptm_pcpu->sptm_templates[num_mappings] = ARM_PTE_NG;
9749 					++num_mappings;
9750 
9751 					if (num_mappings == SPTM_MAPPING_LIMIT) {
9752 						pmap_epoch_enter();
9753 						/**
9754 						 * It's technically possible (though highly unlikely) for subord to
9755 						 * be concurrently trimmed, so re-check the bounds within the epoch to
9756 						 * avoid potentially issuing an SPTM operation against a deleted leaf
9757 						 * page table.  This assumes the following:
9758 						 * 1) The pmap_trim() code path always issues a barrier and an epoch
9759 						 *    drain in between updating subord's true bounds and actually
9760 						 *    trimming subord, effectively purging any operation here which
9761 						 *    may be using stale bounds.
9762 						 * 2) The true bounds, if set, will always be twig-aligned, thus
9763 						 *    the region we operate on here can never span the starting or
9764 						 *    ending bounds.
9765 						 */
9766 						if ((start >= subord->nested_region_true_start) &&
9767 						    (start < subord->nested_region_true_end)) {
9768 							sptm_update_region(subord->ttep, start, num_mappings,
9769 							    sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9770 						}
9771 						pmap_epoch_exit();
9772 						enable_preemption();
9773 						num_mappings = 0;
9774 						start = addr;
9775 						disable_preemption();
9776 						sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9777 					}
9778 				}
9779 				/**
9780 				 * The SPTM does not allow region updates to span multiple leaf page tables, so request
9781 				 * any remaining updates up to vlim before moving to the next page table page.
9782 				 */
9783 				if (num_mappings != 0) {
9784 					pmap_epoch_enter();
9785 					if ((start >= subord->nested_region_true_start) &&
9786 					    (start < subord->nested_region_true_end)) {
9787 						sptm_update_region(subord->ttep, start, num_mappings,
9788 						    sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9789 					}
9790 					pmap_epoch_exit();
9791 				}
9792 				enable_preemption();
9793 				atomic_bitmap_set((_Atomic bitmap_t*)subord->nested_region_unnested_table_bitmap,
9794 				    UNNEST_BIT(current_index), memory_order_release);
9795 			}
9796 			addr = start = vlim;
9797 		}
9798 	}
9799 
9800 	/*
9801 	 * invalidate all pdes for segment at vaddr in pmap grand
9802 	 */
9803 	addr = vaddr;
9804 
9805 	if (addr < subord->nested_region_true_start) {
9806 		addr = subord->nested_region_true_start;
9807 	}
9808 
9809 	if (true_end > subord->nested_region_true_end) {
9810 		true_end = subord->nested_region_true_end;
9811 	}
9812 
9813 	while (addr < true_end) {
9814 		vm_map_offset_t vlim = ((addr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9815 		if (vlim > true_end) {
9816 			vlim = true_end;
9817 		}
9818 		sptm_unnest_region(grand->ttep, subord->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift);
9819 		addr = vlim;
9820 	}
9821 }
9822 
9823 kern_return_t
9824 pmap_unnest_options(
9825 	pmap_t grand,
9826 	addr64_t vaddr,
9827 	uint64_t size,
9828 	unsigned int option)
9829 {
9830 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9831 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9832 
9833 	pmap_verify_preemptible();
9834 	pmap_unnest_options_internal(grand, vaddr, size, option);
9835 
9836 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9837 
9838 	return KERN_SUCCESS;
9839 }
9840 
9841 boolean_t
9842 pmap_adjust_unnest_parameters(
9843 	__unused pmap_t p,
9844 	__unused vm_map_offset_t *s,
9845 	__unused vm_map_offset_t *e)
9846 {
9847 	return TRUE; /* to get to log_unnest_badness()... */
9848 }
9849 
9850 /**
9851  * Perform any necessary pre-nesting of the parent's shared region at fork()
9852  * time.
9853  *
9854  * @note This should only be called from vm_map_fork().
9855  *
9856  * @param old_pmap The pmap of the parent task.
9857  * @param new_pmap The pmap of the child task.
9858  *
9859  * @return KERN_SUCCESS if the pre-nesting was successfully completed.
9860  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
9861  */
9862 kern_return_t
9863 pmap_fork_nest(pmap_t old_pmap, pmap_t new_pmap)
9864 {
9865 	if (old_pmap == NULL || new_pmap == NULL) {
9866 		return KERN_INVALID_ARGUMENT;
9867 	}
9868 	if (old_pmap->nested_pmap == NULL) {
9869 		return KERN_SUCCESS;
9870 	}
9871 	pmap_set_shared_region(new_pmap,
9872 	    old_pmap->nested_pmap,
9873 	    old_pmap->nested_region_addr,
9874 	    old_pmap->nested_region_size);
9875 	return KERN_SUCCESS;
9876 }
9877 
9878 /*
9879  * disable no-execute capability on
9880  * the specified pmap
9881  */
9882 #if DEVELOPMENT || DEBUG
9883 void
9884 pmap_disable_NX(
9885 	pmap_t pmap)
9886 {
9887 	pmap->nx_enabled = FALSE;
9888 }
9889 #else
9890 void
9891 pmap_disable_NX(
9892 	__unused pmap_t pmap)
9893 {
9894 }
9895 #endif
9896 
9897 /*
9898  * flush a range of hardware TLB entries.
9899  * NOTE: assumes the smallest TLB entry in use will be for
9900  * an ARM small page (4K).
9901  */
9902 
9903 #if __ARM_RANGE_TLBI__
9904 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9905 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
9906 #else
9907 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
9908 #endif // __ARM_RANGE_TLBI__
9909 
9910 static void
9911 flush_mmu_tlb_region_asid_async(
9912 	vm_offset_t va,
9913 	size_t length,
9914 	pmap_t pmap,
9915 	bool last_level_only __unused)
9916 {
9917 	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
9918 	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
9919 	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
9920 	const uint16_t asid = PMAP_HWASID(pmap);
9921 
9922 	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
9923 		boolean_t       flush_all = FALSE;
9924 
9925 		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
9926 			flush_all = TRUE;
9927 		}
9928 		if (flush_all) {
9929 			flush_mmu_tlb_async();
9930 		} else {
9931 			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, false);
9932 		}
9933 		return;
9934 	}
9935 #if __ARM_RANGE_TLBI__
9936 	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
9937 		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
9938 		if (pmap->type == PMAP_TYPE_NESTED) {
9939 			flush_mmu_tlb_allrange_async(va, last_level_only, false);
9940 		} else {
9941 			flush_mmu_tlb_range_async(va, last_level_only, false);
9942 		}
9943 		return;
9944 	}
9945 #endif
9946 	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
9947 	va = tlbi_asid(asid) | tlbi_addr(va);
9948 
9949 	if (pmap->type == PMAP_TYPE_NESTED) {
9950 		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, false);
9951 	} else {
9952 		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, false);
9953 	}
9954 }
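
/*
 * Worked example (illustrative, assuming a 16KB-page pmap with range-TLBI
 * support): flushing a 1MB region covers 64 pages, which is above
 * ARM64_RANGE_TLB_FLUSH_THRESHOLD but below ARM64_FULL_TLB_FLUSH_THRESHOLD,
 * so a single ranged invalidate built by generate_rtlbi_param() is issued
 * rather than 64 per-entry invalidates or a full ASID flush.
 */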
9955 
9956 void
9957 flush_mmu_tlb_region(
9958 	vm_offset_t va,
9959 	unsigned length)
9960 {
9961 	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
9962 	sync_tlb_flush();
9963 }
9964 
9965 unsigned int
9966 pmap_cache_attributes(
9967 	ppnum_t pn)
9968 {
9969 	pmap_paddr_t    paddr;
9970 	unsigned int    pai;
9971 	unsigned int    result;
9972 	pp_attr_t       pp_attr_current;
9973 
9974 	paddr = ptoa(pn);
9975 
9976 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
9977 
9978 	if (!pa_valid(paddr)) {
9979 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
9980 		return (io_rgn == NULL || io_rgn->signature == 'SKIO') ? VM_WIMG_IO : io_rgn->wimg;
9981 	}
9982 
9983 	result = VM_WIMG_DEFAULT;
9984 
9985 	pai = pa_index(paddr);
9986 
9987 	pp_attr_current = pp_attr_table[pai];
9988 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9989 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
9990 	}
9991 	return result;
9992 }
9993 
9994 MARK_AS_PMAP_TEXT static void
9995 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
9996 {
9997 	if ((wimg_bits_prev != wimg_bits_new)
9998 	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
9999 	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10000 	    && (wimg_bits_new != VM_WIMG_COPYBACK))
10001 	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
10002 	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10003 		pmap_sync_page_attributes_phys(pn);
10004 	}
10005 
10006 	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10007 		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10008 	}
10009 }
10010 
10011 MARK_AS_PMAP_TEXT __unused void
10012 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10013 {
10014 	pmap_paddr_t paddr = ptoa(pn);
10015 
10016 	if (__improbable(!pa_valid(paddr))) {
10017 		panic("%s called on non-managed page 0x%08x", __func__, pn);
10018 	}
10019 
10020 	pmap_set_cache_attributes_internal(pn, new_cacheattr, false);
10021 
10022 	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10023 }
10024 
10025 static inline bool
10026 cacheattr_supports_compressor(unsigned int cacheattr)
10027 {
10028 	switch (cacheattr) {
10029 	case VM_WIMG_DEFAULT:
10030 		return true;
10031 #if HAS_MTE
10032 	case VM_WIMG_MTE:
10033 		return true;
10034 #endif /* HAS_MTE */
10035 	default:
10036 		return false;
10037 	}
10038 }
10039 
10040 void *
10041 pmap_map_compressor_page(ppnum_t pn)
10042 {
10043 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10044 	if (!cacheattr_supports_compressor(cacheattr)) {
10045 		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10046 	}
10047 
10048 	return (void*)phystokv(ptoa(pn));
10049 }
10050 
10051 void
10052 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10053 {
10054 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10055 	if (!cacheattr_supports_compressor(cacheattr)) {
10056 		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10057 	}
10058 }
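
/*
 * Illustrative pairing (not part of the original source): compressor callers
 * are expected to bracket their access with these two helpers so that pages
 * with unsupported cache attributes are temporarily switched to a
 * compressor-friendly attribute and then restored:
 *
 *	void *kva = pmap_map_compressor_page(pn);
 *	... compress or decompress through kva ...
 *	pmap_unmap_compressor_page(pn, kva);
 */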
10059 
10060 /**
10061  * Flushes TLB entries associated with the page specified by paddr, but does not
10062  * issue barriers yet.
10063  *
10064  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10065  */
10066 static void
10067 pmap_flush_tlb_for_paddr_async(pmap_paddr_t paddr)
10068 {
10069 	/* Flush the physical aperture mappings. */
10070 	const vm_offset_t kva = phystokv(paddr);
10071 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
10072 
10073 	/* Flush the mappings tracked in the ptes. */
10074 	const unsigned int pai = pa_index(paddr);
10075 	locked_pvh_t locked_pvh = pvh_lock(pai);
10076 
10077 	pt_entry_t *pte_p = PT_ENTRY_NULL;
10078 	pv_entry_t *pve_p = PV_ENTRY_NULL;
10079 
10080 	if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
10081 		pte_p = pvh_ptep(locked_pvh.pvh);
10082 	} else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
10083 		pve_p = pvh_pve_list(locked_pvh.pvh);
10084 		pte_p = PT_ENTRY_NULL;
10085 	}
10086 
10087 	unsigned int nptes = 0;
10088 	int pve_ptep_idx = 0;
10089 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10090 		if (pve_p != PV_ENTRY_NULL) {
10091 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10092 			if (pte_p == PT_ENTRY_NULL) {
10093 				goto flush_tlb_skip_pte;
10094 			}
10095 		}
10096 
10097 		if (__improbable(nptes == SPTM_MAPPING_LIMIT)) {
10098 			pvh_lock_enter_sleep_mode(&locked_pvh);
10099 		}
10100 		++nptes;
10101 #ifdef PVH_FLAG_IOMMU
10102 		if (pvh_ptep_is_iommu(pte_p)) {
10103 			goto flush_tlb_skip_pte;
10104 		}
10105 #endif /* PVH_FLAG_IOMMU */
10106 		const pmap_t pmap = ptep_get_pmap(pte_p);
10107 		const vm_map_address_t va = ptep_get_va(pte_p);
10108 
10109 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
10110 
10111 flush_tlb_skip_pte:
10112 		pte_p = PT_ENTRY_NULL;
10113 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10114 			pve_ptep_idx = 0;
10115 			pve_p = pve_next(pve_p);
10116 		}
10117 	}
10118 	pvh_unlock(&locked_pvh);
10119 }
10120 
10121 /**
10122  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10123  *
10124  * @param pai The Physical Address Index of the entry.
10125  * @param cacheattr The new cache attribute.
10126  */
10127 MARK_AS_PMAP_TEXT static void
10128 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10129 {
10130 	pvh_assert_locked(pai);
10131 
10132 	pp_attr_t pp_attr_current, pp_attr_template;
10133 	do {
10134 		pp_attr_current = pp_attr_table[pai];
10135 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10136 
10137 		/**
10138 		 * WIMG bits should only be updated under the PVH lock, but we should do
10139 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10140 		 */
10141 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10142 }
10143 
10144 /**
10145  * Structure for tracking where we are during the collection of mappings for batch
10146  * cache attribute updates.
10147  *
10148  * @note We need to track where in the per-cpu ops table we are filling the next mappings into,
10149  *       because the collection routine can return with a partially filled ops table when
10150  *       it exhausts the PV list for a page. In that case, the remaining slots in the ops table
10151  *       will be used for mappings of the next page.
10152  *
10153  * @note We also need to record where we are in the PV list, because the collection routine can
10154  *       also return when the ops table is filled but it's still in the middle of the PV list.
10155  *       Those remaining items in the PV list need to be handled by the next batch operation in
10156  *       a new ops table.
10157  */
10158 typedef struct {
10159 	/* Where we are in the sptm ops table. */
10160 	unsigned int sptm_ops_index;
10161 
10162 	/**
10163 	 * The last collected physical address from the previous full ops array (and in turn, SPTM
10164 	 * call). This is used to know whether the SPTM call for the latest full ops table should
10165 	 * skip updating the PAPT mapping (seeing as the last call would have handled updating it).
10166 	 */
10167 	pmap_paddr_t last_table_last_papt_pa;
10168 
10169 	/**
10170 	 * Where we are in the pv list.
10171 	 *
10172 	 * When ptep is non-null, there's only one mapping to the page and the ptep is the address
10173 	 * of it.
10174 	 *
10175 	 * When pvep is non-null, there's more than one mapping and the mappings are tracked by the
10176 	 * PV list.
10177 	 *
10178 	 * When they are both null, it indicates we are collecting for a new page and the collection
10179 	 * function will initialize them to be one of the two states above.
10180 	 *
10181 	 * It is undefined when they are both non-null.
10182 	 */
10183 	pt_entry_t *ptep;
10184 	pv_entry_t *pvep;
10185 	unsigned int pve_ptep_idx;
10186 } pmap_sptm_update_cache_attr_ops_collect_state_t;
10187 
10188 /**
10189  * Reports whether there are any pending ops in an SPTM cache attr ops table.
10190  *
10191  * @param state A pmap_sptm_update_cache_attr_ops_collect_state_t structure.
10192  *
10193  * @return True if there's any outstanding cache attr op.
10194  *         False otherwise.
10195  */
10196 static inline bool
10197 pmap_is_sptm_update_cache_attr_ops_pending(pmap_sptm_update_cache_attr_ops_collect_state_t state)
10198 {
10199 	return state.sptm_ops_index > 0;
10200 }
10201 
10202 /**
10203  * Enum encoding the collection status into pmap_sptm_update_cache_attr_ops_collect()'s
10204  * return value, indicating what kind of attention it needs.
10205  */
10206 typedef enum {
10207 	OPS_COLLECT_NOTHING = 0x0,
10208 
10209 	/* The ops table is full, and the caller should commit the table to SPTM. */
10210 	OPS_COLLECT_RETURN_FULL_TABLE = 0x1,
10211 
10212 	/**
10213 	 * The page has its mappings completely collected, and the caller should
10214 	 * pass in a new page next time.
10215 	 */
10216 	OPS_COLLECT_RETURN_COMPLETED_PAGE = 0x2,
10217 } pmap_sptm_update_cache_attr_ops_collect_return_t;
10218 
10219 /**
10220  * Collects mappings of a physical page into an SPTM ops table for cache attribute updates.
10221  *
10222  * @note This routine returns either when the ops table is full or the page represented by
10223  *       pa has no more mappings to collect. The caller should call this routine again with
10224  *       a fresh ops table, or a new page, or both, depending on the return code.
10225  *
10226  * @note The PVH lock needs to be held for pa.
10227  *
10228  * @param state Tracks the state of PV list traversal and SPTM ops table filling. It is used
10229  *              by this routine to save the progress of the collection.
10230  * @param sptm_ops Pointer to the SPTM ops table.
10231  * @param pa The physical address whose mappings are to be collected.
10232  * @param attributes The new cache attributes.
10233  *
10234  * @return A pmap_sptm_update_cache_attr_ops_collect_return_t that encodes what the caller
10235  *         should do before calling this routine again. See the inline comments around
10236  *         pmap_sptm_update_cache_attr_ops_collect_return_t for details.
10237  */
10238 static pmap_sptm_update_cache_attr_ops_collect_return_t
10239 pmap_sptm_update_cache_attr_ops_collect(
10240 	pmap_sptm_update_cache_attr_ops_collect_state_t *state,
10241 	sptm_update_disjoint_multipage_op_t *sptm_ops,
10242 	pmap_paddr_t pa,
10243 	unsigned int attributes)
10244 {
10245 	if (state == NULL || sptm_ops == NULL) {
10246 		panic("%s: unexpected null arguments - state: %p, sptm_ops: %p", __func__, state, sptm_ops);
10247 	}
10248 
10249 	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_START, pa, attributes, state->sptm_ops_index);
10250 
10251 	/* Copy the states into local variables. */
10252 	unsigned int sptm_ops_index = state->sptm_ops_index;
10253 	pmap_paddr_t last_table_last_papt_pa = state->last_table_last_papt_pa;
10254 	pv_entry_t *pvep = state->pvep;
10255 	pt_entry_t *ptep = state->ptep;
10256 	unsigned int pve_ptep_idx = state->pve_ptep_idx;
10257 
10258 	unsigned int pai = pa_index(pa);
10259 
10260 	/* We should at least have one free slot in the ops table. */
10261 	assert(sptm_ops_index < SPTM_MAPPING_LIMIT);
10262 
10263 	/* The PVH lock for pa has to be locked. */
10264 	pvh_assert_locked(pai);
10265 
10266 	/* If pvep and ptep are both null in the state, it's a new page. Initialize the states. */
10267 	if (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL) {
10268 		const uintptr_t pvh = pai_to_pvh(pai);
10269 		if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
10270 			ptep = PT_ENTRY_NULL;
10271 			pvep = pvh_pve_list(pvh);
10272 			pve_ptep_idx = 0;
10273 		} else if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
10274 			ptep  = pvh_ptep(pvh);
10275 			pvep = PV_ENTRY_NULL;
10276 			pve_ptep_idx = 0;
10277 		}
10278 	}
10279 
10280 	/**
10281 	 * The first entry filled in is always the PAPT header entry:
10282 	 *
10283 	 * 1) In the case of a fresh ops table, the first entry has to be a PAPT header.
10284 	 * 2) In the case of a fresh page, we need to insert a new PAPT header to request
10285 	 *    SPTM to operate on a new page.
10286 	 *
10287 	 * Remember the index of the PAPT header here so that we can update the number
10288 	 * of mappings field later when we finish collecting.
10289 	 */
10290 	const unsigned int papt_sptm_ops_index = sptm_ops_index;
10291 	unsigned int num_mappings = 0;
10292 
10293 	/* Assemble the PTE template for the PAPT mapping. */
10294 	const vm_address_t kva = phystokv(pa);
10295 	const pt_entry_t *papt_ptep = pmap_pte(kernel_pmap, kva);
10296 
10297 	pt_entry_t template = os_atomic_load(papt_ptep, relaxed);
10298 	template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10299 	template |= wimg_to_pte(attributes, pa);
10300 
10301 	/* Fill in the PAPT header entry. */
10302 	sptm_ops[papt_sptm_ops_index].per_paddr_header.paddr = pa;
10303 	sptm_ops[papt_sptm_ops_index].per_paddr_header.papt_pte_template = template;
10304 	sptm_ops[papt_sptm_ops_index].per_paddr_header.options = SPTM_UPDATE_SH | SPTM_UPDATE_MAIR | SPTM_UPDATE_DEFER_TLBI;
10305 
10306 	if ((papt_sptm_ops_index == 0) && (pa == last_table_last_papt_pa)) {
10307 		/**
10308 		 * If the previous SPTM call was made with an ops table that already included
10309 		 * updating the PA of the page that this table starts with, then we can assume
10310 		 * that call already updated the PAPT and we can safely skip it in this
10311 		 * upcoming one.
10312 		 */
10313 		sptm_ops[0].per_paddr_header.options |= SPTM_UPDATE_SKIP_PAPT;
10314 	}
10315 
10316 	sptm_ops_index++;
10317 
10318 	/**
10319 	 * Main loop for collecting the mappings into the ops table. It terminates either
10320 	 * when the ops table is full or the PV list is exhausted.
10321 	 */
10322 	while ((sptm_ops_index < SPTM_MAPPING_LIMIT) && (pvep != PV_ENTRY_NULL || ptep != PT_ENTRY_NULL)) {
10323 		/**
10324 		 * Update ptep. There are really two cases here:
10325 		 *
10326 		 * 1) pvep is PV_ENTRY_NULL. In this case, ptep holds the pointer to
10327 		 *    the only mapping to the page.
10328 		 * 2) pvep is not PV_ENTRY_NULL. In that case, ptep is updated according to
10329 		 *    pvep and pve_ptep_idx.
10330 		 */
10331 		if (pvep != PV_ENTRY_NULL) {
10332 			ptep = pve_get_ptep(pvep, pve_ptep_idx);
10333 
10334 			/* This pve is empty, so skip to next one. */
10335 			if (ptep == PT_ENTRY_NULL) {
10336 				goto sucaoc_skip_pte;
10337 			}
10338 		}
10339 
10340 #ifdef PVH_FLAG_IOMMU
10341 		/* Skip IOMMU pteps. */
10342 		if (pvh_ptep_is_iommu(ptep)) {
10343 			goto sucaoc_skip_pte;
10344 		}
10345 #endif
10346 		/* Assemble the PTE template for the mapping. */
10347 		const vm_address_t va = ptep_get_va(ptep);
10348 		const pmap_t pmap = ptep_get_pmap(ptep);
10349 
10350 		template = os_atomic_load(ptep, relaxed);
10351 		template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10352 		template |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, pa);
10353 
10354 		/* Fill into the ops table. */
10355 		sptm_ops[sptm_ops_index].disjoint_op.root_pt_paddr = pmap->ttep;
10356 		sptm_ops[sptm_ops_index].disjoint_op.vaddr = va;
10357 		sptm_ops[sptm_ops_index].disjoint_op.pte_template = template;
10358 
10359 		/* Move the sptm ops table cursor. */
10360 		sptm_ops_index++;
10361 
10362 		/* Increment the mappings counter. */
10363 		num_mappings++;
10364 
10365 sucaoc_skip_pte:
10366 		/**
10367 		 * Reset ptep to PT_ENTRY_NULL to keep the loop precondition of either ptep
10368 		 * or pvep is nonnull (not both, not neither) true.
10369 		 */
10370 		ptep = PT_ENTRY_NULL;
10371 
10372 		/* Advance to next pvep if we have exhausted the pteps in it. */
10373 		if ((pvep != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10374 			pve_ptep_idx = 0;
10375 			pvep = pve_next(pvep);
10376 		}
10377 	}
10378 
10379 	/* Update the PAPT header for the number of mappings. */
10380 	sptm_ops[papt_sptm_ops_index].per_paddr_header.num_mappings = num_mappings;
10381 
10382 	const bool full_table = (sptm_ops_index >= SPTM_MAPPING_LIMIT);
10383 	const bool collection_done_for_page = (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL);
10384 
10385 	/**
10386 	 * The ops table is full, so the caller should now invoke the SPTM before calling
10387 	 * into this function again.
10388 	 */
10389 	if (full_table) {
10390 		/* Update last_table_last_papt_pa to be the pa collected in this call. */
10391 		last_table_last_papt_pa = pa;
10392 
10393 		/* Reset sptm_ops_index. */
10394 		sptm_ops_index = 0;
10395 	}
10396 
10397 	/* Copy the updated collection states back to the parameter structure. */
10398 	state->sptm_ops_index = sptm_ops_index;
10399 	state->last_table_last_papt_pa = last_table_last_papt_pa;
10400 	state->pvep = pvep;
10401 	state->ptep = ptep;
10402 	state->pve_ptep_idx = pve_ptep_idx;
10403 
10404 	/* Assemble the return value. */
10405 	pmap_sptm_update_cache_attr_ops_collect_return_t retval = OPS_COLLECT_NOTHING;
10406 
10407 	if (full_table) {
10408 		retval |= OPS_COLLECT_RETURN_FULL_TABLE;
10409 	}
10410 
10411 	if (collection_done_for_page) {
10412 		retval |= OPS_COLLECT_RETURN_COMPLETED_PAGE;
10413 	}
10414 
10415 	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_END, pa, attributes, sptm_ops_index);
10416 
10417 	return retval;
10418 }
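
/*
 * Simplified caller sketch (illustrative only; the real consumer below,
 * pmap_batch_set_cache_attributes_internal(), adds PVH locking, preemption
 * and epoch management around these steps):
 *
 *	pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
 *	for each managed page pa in the batch:
 *		pmap_sptm_update_cache_attr_ops_collect_return_t ret;
 *		do {
 *			ret = pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, pa, attr);
 *			if (ret & OPS_COLLECT_RETURN_FULL_TABLE)
 *				... submit the full table via sptm_update_disjoint_multipage() ...
 *		} while (!(ret & OPS_COLLECT_RETURN_COMPLETED_PAGE));
 *	// finally, submit any partially filled table tracked by state.sptm_ops_index
 */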
10419 
10420 /* At least one PAPT header plus one mapping. */
10421 static_assert(SPTM_MAPPING_LIMIT >= 2);
10422 
10423 /**
10424  * Returns whether a cache attribute is allowed (on managed pages).
10425  *
10426  * @param attributes A 32-bit value whose VM_WIMG_MASK bits represent the
10427  *                   cache attribute.
10428  *
10429  * @return True if the cache attribute is allowed on managed pages.
10430  *         False otherwise.
10431  */
10432 static bool
10433 pmap_is_cache_attribute_allowed(unsigned int attributes)
10434 {
10435 	if (pmap_panic_dev_wimg_on_managed) {
10436 		switch (attributes & VM_WIMG_MASK) {
10437 		/* supported on DRAM, but slow, so we disallow */
10438 		case VM_WIMG_IO:                        // nGnRnE
10439 		case VM_WIMG_POSTED:                    // nGnRE
10440 
10441 		/* unsupported on DRAM */
10442 		case VM_WIMG_POSTED_REORDERED:          // nGRE
10443 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10444 			return false;
10445 
10446 		default:
10447 			return true;
10448 		}
10449 	}
10450 
10451 	return true;
10452 }
10453 
10454 /**
10455  * Batch updates the cache attributes of a list of pages in three passes.
10456  *
10457  * In pass one, the pp_attr_table and the pte are updated (by SPTM) for the pages in the list.
10458  * In pass two, TLB entries are flushed for each page in the list if necessary.
10459  * In pass three, caches are cleaned for each page in the list if necessary.
10460  *
10461  * @param page_list List of pages to be updated.
10462  * @param cacheattr The new cache attributes.
10463  * @param update_attr_table Whether the pp_attr_table should be updated. This is useful for compressor
10464  *                          pages where it's desired to keep the old WIMG bits.
10465  */
10466 void
10467 pmap_batch_set_cache_attributes_internal(
10468 	const unified_page_list_t *page_list,
10469 	unsigned int cacheattr,
10470 	bool update_attr_table)
10471 {
10472 	bool tlb_flush_pass_needed = false;
10473 	bool rt_cache_flush_pass_needed = false;
10474 	bool preemption_disabled = false;
10475 
10476 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE1);
10477 
10478 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
10479 	sptm_update_disjoint_multipage_op_t *sptm_ops = NULL;
10480 
10481 	pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
10482 
10483 	unified_page_list_iterator_t iter;
10484 
10485 	for (unified_page_list_iterator_init(page_list, &iter);
10486 	    !unified_page_list_iterator_end(&iter);
10487 	    unified_page_list_iterator_next(&iter)) {
10488 		bool is_fictitious = false;
10489 		const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10490 		const pmap_paddr_t paddr = ptoa(pn);
10491 
10492 		/**
10493 		 * Skip if the page is not managed.
10494 		 *
10495 		 * We don't panic here because callers sometimes blindly pass in pages
10496 		 * that are not managed. We need to handle that gracefully.
10497 		 */
10498 		if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10499 			continue;
10500 		}
10501 
10502 		const unsigned int pai = pa_index(paddr);
10503 		locked_pvh_t locked_pvh = {.pvh = 0};
10504 
10505 		if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10506 			/**
10507 			 * If we're partway through processing a multi-page batched call,
10508 			 * preemption will already be disabled so we can't simply call
10509 			 * pvh_lock() which may block.  Instead, we first try to acquire
10510 			 * the lock without waiting, which in most cases should succeed.
10511 			 * If it fails, we submit the pending batched operations to re-
10512 			 * enable preemption and then acquire the lock normally.
10513 			 */
10514 			locked_pvh = pvh_try_lock(pai);
10515 			if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
10516 				assert(preemption_disabled);
10517 				const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10518 				pmap_epoch_exit();
10519 				enable_preemption();
10520 				preemption_disabled = false;
10521 				if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10522 					tlb_flush_pass_needed = true;
10523 				}
10524 				state.sptm_ops_index = 0;
10525 				locked_pvh = pvh_lock(pai);
10526 			}
10527 		} else {
10528 			locked_pvh = pvh_lock(pai);
10529 		}
10530 		assert(locked_pvh.pvh != 0);
10531 
10532 		const pp_attr_t pp_attr_current = pp_attr_table[pai];
10533 
10534 		unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10535 		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10536 			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10537 		}
10538 
10539 		const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10540 
10541 		unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10542 		if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10543 			wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10544 		}
10545 
10546 		/**
10547 		 * When update_attr_table is false, the wimg_bits_prev value read from pp_attr_table
10548 		 * cannot be trusted, so we force an update of the cache attribute.
10549 		 */
10550 		const bool force_update = !update_attr_table;
10551 		/* Update the cache attributes in PTE and PP_ATTR table. */
10552 		if ((wimg_bits_new != wimg_bits_prev) || force_update) {
10553 			if (!pmap_is_cache_attribute_allowed(cacheattr)) {
10554 				panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, pn=%#x",
10555 				    __func__, cacheattr & VM_WIMG_MASK, pn);
10556 			}
10557 
10558 			/* Update PP_ATTR_TABLE */
10559 			if (update_attr_table) {
10560 				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10561 			}
10562 
10563 			bool mapping_collection_done = false;
10564 			bool pvh_lock_sleep_mode_needed = false;
10565 			do {
10566 				if (__improbable(pvh_lock_sleep_mode_needed)) {
10567 					assert(!preemption_disabled);
10568 					pvh_lock_enter_sleep_mode(&locked_pvh);
10569 					pvh_lock_sleep_mode_needed = false;
10570 				}
10571 
10572 				/* Disable preemption to use the per-CPU structure safely. */
10573 				if (!preemption_disabled) {
10574 					preemption_disabled = true;
10575 					disable_preemption();
10576 					/**
10577 					 * Enter the pmap epoch while we gather the disjoint update arguments
10578 					 * and issue the SPTM call.  Since this operation may cover multiple physical
10579 					 * pages, we may construct the argument array and invoke the SPTM without holding
10580 					 * all relevant PVH locks, so we need to record that we are collecting and modifying
10581 					 * mapping state so that e.g. pmap_page_protect() does not attempt to retype the
10582 					 * underlying pages and pmap_remove() does not attempt to free the page tables
10583 					 * used for these mappings without first draining our epoch.
10584 					 */
10585 					pmap_epoch_enter();
10586 
10587 					sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
10588 					sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops;
10589 				}
10590 
10591 				/* The return value indicates if we should call into SPTM in this iteration. */
10592 				pmap_sptm_update_cache_attr_ops_collect_return_t retval =
10593 				    pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
10594 
10595 				/* The collection routine should only return if it needs attention. */
10596 				assert(retval != OPS_COLLECT_NOTHING);
10597 
10598 				/* Gather information for next step from the return value. */
10599 				mapping_collection_done = retval & OPS_COLLECT_RETURN_COMPLETED_PAGE;
10600 				const bool call_sptm = retval & OPS_COLLECT_RETURN_FULL_TABLE;
10601 
10602 				if (call_sptm) {
10603 					/* Call into SPTM with this SPTM ops table. */
10604 					sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
10605 					/**
10606 					 * We may be submitting the batch and exiting the epoch partway through
10607 					 * processing the PV list for a page.  That's fine, because in that case we'll
10608 					 * hold the PV lock for that page, which will prevent mappings of that page from
10609 					 * being disconnected and will prevent the completion of pmap_remove() against
10610 					 * any of those mappings, thus also guaranteeing the relevant page table pages
10611 					 * can't be freed.  The epoch still protects mappings for any prior page in
10612 					 * the batch, whose PV locks are no longer held.
10613 					 */
10614 					pmap_epoch_exit();
10615 					/**
10616 					 * Balance out the explicit disable_preemption() made either at the beginning of
10617 					 * the function or on a prior iteration of the loop that placed the PVH lock in
10618 					 * sleep mode.  Note that enable_preemption() decrements a per-thread counter,
10619 					 * so if we still happen to hold the PVH lock in spin mode preemption won't
10620 					 * actually be re-enabled until we switch the lock over to sleep mode on
10621 					 * the next iteration.
10622 					 */
10623 					enable_preemption();
10624 					preemption_disabled = false;
10625 					pvh_lock_sleep_mode_needed = true;
10626 
10627 					if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10628 						tlb_flush_pass_needed = true;
10629 					}
10630 				}
10631 
10632 				/* We cannot be in a situation where we didn't call into SPTM while also having not finished walking the pv list. */
10633 				assert(call_sptm || mapping_collection_done);
10634 			} while (!mapping_collection_done);
10635 
10636 			/**
10637 			 * We could technically force the cache flush pass here when force_update is true, but
10638 			 * since the compressor mapping/unmapping path handles cache flushing itself, it's fine
10639 			 * leaving this as is.
10640 			 */
10641 			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10642 				rt_cache_flush_pass_needed = true;
10643 			}
10644 		}
10645 
10646 		pvh_unlock(&locked_pvh);
10647 	}
10648 
10649 	if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
10650 		assert(preemption_disabled);
10651 		sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
10652 		pmap_epoch_exit();
10653 		if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
10654 			tlb_flush_pass_needed = true;
10655 		}
10656 
10657 		/**
10658 		 * This is the final SPTM update call in this function, so it's
10659 		 * okay not to update the state variables.
10660 		 */
10661 
10662 		enable_preemption();
10663 	} else if (preemption_disabled) {
10664 		pmap_epoch_exit();
10665 		enable_preemption();
10666 	}
10667 
10668 	if (tlb_flush_pass_needed) {
10669 		/* Sync the PTE writes before potential TLB/Cache flushes. */
10670 		FLUSH_PTE_STRONG();
10671 
10672 		/**
10673 		 * Pass 2: for each physical page and for each mapping, we need to flush
10674 		 * the TLB for it.
10675 		 */
10676 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE2);
10677 		for (unified_page_list_iterator_init(page_list, &iter);
10678 		    !unified_page_list_iterator_end(&iter);
10679 		    unified_page_list_iterator_next(&iter)) {
10680 			bool is_fictitious = false;
10681 			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10682 			const pmap_paddr_t paddr = ptoa(pn);
10683 
10684 			if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10685 				continue;
10686 			}
10687 
10688 			pmap_flush_tlb_for_paddr_async(paddr);
10689 		}
10690 
10691 #if HAS_FEAT_XS
10692 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10693 		arm64_sync_tlb(false);
10694 #else
10695 		/**
10696 		 * For targets that distinguish between mild and strong DSB, mild DSB
10697 		 * will not drain the prefetcher.  This can lead to prefetch-driven
10698 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
10699 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10700 		 */
10701 		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10702 #endif
10703 	}
10704 
10705 	if (rt_cache_flush_pass_needed) {
10706 		/* Pass 3: Flush the cache if the page is recently set to RT */
10707 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE3);
10708 		/**
10709 		 * We disable preemption to ensure we are not preempted
10710 		 * in the state where DC by VA instructions remain enabled.
10711 		 */
10712 		disable_preemption();
10713 
10714 		assert(get_preemption_level() > 0);
10715 
10716 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10717 		/**
10718 		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10719 		 * and the host will handle cache maintenance for it. So we don't need to
10720 		 * worry about enabling the ops here for AVP.
10721 		 */
10722 		enable_dc_mva_ops();
10723 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10724 		/**
10725 		 * DMB should be sufficient to ensure prior accesses to the memory in question are
10726 		 * correctly ordered relative to the upcoming cache maintenance operations.
10727 		 */
10728 		__builtin_arm_dmb(DMB_SY);
10729 
10730 		for (unified_page_list_iterator_init(page_list, &iter);
10731 		    !unified_page_list_iterator_end(&iter);) {
10732 			bool is_fictitious = false;
10733 			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10734 			const pmap_paddr_t paddr = ptoa(pn);
10735 
10736 			if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10737 				unified_page_list_iterator_next(&iter);
10738 				continue;
10739 			}
10740 
10741 			CleanPoC_DcacheRegion_Force_nopreempt_nohid_nobarrier(phystokv(paddr), PAGE_SIZE);
10742 
10743 			unified_page_list_iterator_next(&iter);
10744 			if (__improbable(pmap_pending_preemption() && !unified_page_list_iterator_end(&iter))) {
10745 				__builtin_arm_dsb(DSB_SY);
10746 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10747 				disable_dc_mva_ops();
10748 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10749 				enable_preemption();
10750 				assert(preemption_enabled());
10751 				disable_preemption();
10752 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10753 				enable_dc_mva_ops();
10754 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10755 			}
10756 		}
10757 
10758 		/* Issue DSB to ensure cache maintenance is fully complete before subsequent accesses. */
10759 		__builtin_arm_dsb(DSB_SY);
10760 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10761 		disable_dc_mva_ops();
10762 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10763 
10764 		enable_preemption();
10765 	}
10766 
10767 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE4);
10768 }
10769 
10770 /**
10771  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10772  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10773  *
10774  * @param page_list List of pages to be updated.
10775  * @param cacheattr The new cache attribute.
10776  */
10777 void
10778 pmap_batch_set_cache_attributes(
10779 	const unified_page_list_t *page_list,
10780 	unsigned int cacheattr)
10781 {
10782 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10783 
10784 	/* Verify we are being called from a preemptible context. */
10785 	pmap_verify_preemptible();
10786 
10787 	pmap_batch_set_cache_attributes_internal(page_list, cacheattr, true);
10788 
10789 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10790 }
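/*
 * Illustrative sketch (hypothetical caller, not compiled): batching two pages.
 * The page numbers and the two-entry UPL array are made up; the construction
 * mirrors the single-page wrapper below.
 *
 *	upl_page_info_t pages[2] = { { .phys_addr = pn0 }, { .phys_addr = pn1 } };
 *	const unified_page_list_t list = {
 *		.upl = { .upl_info = pages, .upl_size = 2 },
 *		.type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
 *	};
 *	pmap_batch_set_cache_attributes(&list, VM_WIMG_DEFAULT);  // preemptible context required
 */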
10791 
10792 MARK_AS_PMAP_TEXT void
10793 pmap_set_cache_attributes_internal(
10794 	ppnum_t pn,
10795 	unsigned int cacheattr,
10796 	bool update_attr_table)
10797 {
10798 	upl_page_info_t single_page_upl = { .phys_addr = pn };
10799 	const unified_page_list_t page_list = {
10800 		.upl = {.upl_info = &single_page_upl, .upl_size = 1},
10801 		.type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
10802 	};
10803 
10804 	pmap_batch_set_cache_attributes_internal(&page_list, cacheattr, update_attr_table);
10805 }
10806 
10807 void
10808 pmap_set_cache_attributes(
10809 	ppnum_t pn,
10810 	unsigned int cacheattr)
10811 {
10812 	pmap_set_cache_attributes_internal(pn, cacheattr, true);
10813 }
10814 
10815 void
10816 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10817     vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10818 {
10819 	pmap_paddr_t data_pa = 0; // data address
10820 	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10821 	pmap_paddr_t text_pa = 0; // text address
10822 
10823 	*kernel_data_addr = 0;
10824 	*kernel_text_addr = 0;
10825 	*user_text_addr = 0;
10826 
10827 	kern_return_t kr = pmap_page_alloc(&data_pa, PMAP_PAGE_ALLOCATE_NONE);
10828 	assert(kr == KERN_SUCCESS);
10829 
10830 	kr = pmap_page_alloc(&ro_data_pa, PMAP_PAGE_ALLOCATE_NONE);
10831 	assert(kr == KERN_SUCCESS);
10832 
10833 #if CONFIG_ARM_PFZ
10834 	kr = pmap_page_alloc(&text_pa, PMAP_PAGE_ALLOCATE_NONE);
10835 	assert(kr == KERN_SUCCESS);
10836 
10837 	/**
10838 	 * User mapping of the commpage text section, created for 64-bit mappings only.
10839 	 *
10840 	 * We don't insert it into the 32-bit mapping because we don't want 32-bit
10841 	 * user processes to get this page mapped in; they should never call into
10842 	 * this page.
10843 	 *
10844 	 * The data commpage is in a pre-reserved L3 VA range and the text commpage
10845 	 * is slid within the same L3 page table as the data commpage.  It is either
10846 	 * outside the max user VA or is pre-reserved in vm_map_exec(). This means
10847 	 * that it is reserved and unavailable to the Mach VM for future mappings.
10848 	 */
10849 	const int num_ptes = pt_attr_leaf_size(native_pt_attr) >> PTE_SHIFT;
10850 
10851 	do {
10852 		const int text_leaf_index = random() % num_ptes;
10853 
10854 		/**
10855 		 * Generate a VA for the commpage text with the same root and twig index as data
10856 		 * comm page, but with new leaf index we've just generated.
10857 		 */
10858 		commpage_text_user_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr));
10859 		commpage_text_user_va |= (text_leaf_index << pt_attr_leaf_shift(native_pt_attr));
10860 	} while ((commpage_text_user_va == _COMM_PAGE64_BASE_ADDRESS) ||
10861 	    (commpage_text_user_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10862 
10863 	*user_text_addr = commpage_text_user_va;
10864 	*kernel_text_addr = phystokv(text_pa);
10865 #endif
10866 
10867 	/* For manipulation in kernel, go straight to physical page */
10868 	commpage_data_pa = data_pa;
10869 	*kernel_data_addr = phystokv(data_pa);
10870 	assert(commpage_ro_data_pa == 0);
10871 	commpage_ro_data_pa = ro_data_pa;
10872 	*kernel_ro_data_addr = phystokv(ro_data_pa);
10873 	assert(commpage_text_pa == 0);
10874 	commpage_text_pa = text_pa;
10875 }
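/*
 * Illustrative sketch (worked example, not compiled): derivation of the
 * randomized commpage text VA above, assuming a hypothetical random leaf
 * index of 0x25.
 *
 *	va  = _COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr);
 *	va |= (0x25 << pt_attr_leaf_shift(native_pt_attr));
 *
 * The result keeps the data commpage's root and twig indices but lands in a
 * fresh leaf slot, retried only if it collides with the data or RO commpage.
 */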
10876 
10877 
10878 /*
10879  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10880  * with user controlled TTEs for regions that aren't explicitly reserved by the
10881  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10882  */
10883 #if (ARM_PGSHIFT == 14)
10884 /**
10885  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10886  * commpage completely above the maximum 32-bit userspace VA.
10887  */
10888 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10889 static_assert(_COMM_PAGE64_NESTING_START == SPTM_ARM64_COMMPAGE_REGION_START);
10890 static_assert(_COMM_PAGE64_NESTING_SIZE == SPTM_ARM64_COMMPAGE_REGION_SIZE);
10891 
10892 /**
10893  * Normally there'd be an assert to check that 64-bit devices with 64-bit
10894  * userspace VAs can nest the commpage completely above the maximum 64-bit
10895  * userspace VA, but that technically isn't true on macOS. On those systems, the
10896  * commpage lives within the userspace VA range, but is protected by the VM as
10897  * a reserved region (see vm_reserved_regions[] definition for more info).
10898  */
10899 
10900 #elif (ARM_PGSHIFT == 12)
10901 /**
10902  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10903  * above the maximum userspace VA.
10904  */
10905 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10906 #else
10907 #error Nested shared page mapping is unsupported on this config
10908 #endif
10909 
10910 MARK_AS_PMAP_TEXT kern_return_t
10911 pmap_insert_commpage_internal(
10912 	pmap_t pmap)
10913 {
10914 	kern_return_t kr = KERN_SUCCESS;
10915 	vm_offset_t commpage_vaddr;
10916 	pt_entry_t *ttep;
10917 	pmap_paddr_t commpage_table = commpage_default_table;
10918 
10919 	/* Validate the pmap input before accessing its data. */
10920 	validate_pmap_mutable(pmap);
10921 
10922 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10923 	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10924 
10925 #if __ARM_MIXED_PAGE_SIZE__
10926 #if !__ARM_16K_PG__
10927 	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
10928 	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10929 #endif /* !__ARM_16K_PG__ */
10930 
10931 	/* Choose the correct shared page pmap to use. */
10932 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
10933 	if (pmap_page_size == 4096) {
10934 		if (pmap_is_64bit(pmap)) {
10935 			commpage_table = commpage_4k_table;
10936 		} else {
10937 			panic("32-bit 4k commpage not currently supported for SPTM configurations");
10938 			//commpage_table = commpage32_4k_table;
10939 		}
10940 	} else if (pmap_page_size != 16384) {
10941 		panic("No commpage table exists for the wanted page size: %llu", pmap_page_size);
10942 	} else
10943 #endif /* __ARM_MIXED_PAGE_SIZE__ */
10944 	{
10945 		if (pmap_is_64bit(pmap)) {
10946 			commpage_table = commpage_default_table;
10947 		} else {
10948 			commpage_table = commpage32_default_table;
10949 		}
10950 	}
10951 
10952 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
10953 #error We assume a single page.
10954 #endif
10955 
10956 	if (pmap_is_64bit(pmap)) {
10957 		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10958 	} else {
10959 		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10960 	}
10961 
10962 
10963 	pmap_lock(pmap, PMAP_LOCK_SHARED);
10964 
10965 	/*
10966 	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
10967 	 * two (2MB) depending on the address space layout. For 16KB pages, each level
10968 	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
10969 	 * to "nest".
10970 	 *
10971 	 * Note: This is not "nesting" in the shared cache sense. This definition of
10972 	 * nesting just means inserting pointers to pre-allocated tables into
10973 	 * the passed-in pmap to allow us to share page tables (which map the shared
10974 	 * page) for every task. This saves at least one page of memory per process
10975 	 * compared to creating new page tables in every process for mapping the
10976 	 * shared page.
10977 	 */
10978 
10979 	/**
10980 	 * Allocate the twig page tables if needed, and slam a pointer to the shared
10981 	 * page's tables into place.
10982 	 */
10983 	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
10984 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
10985 
10986 		kr = pmap_expand(pmap, commpage_vaddr, 0, commpage_level);
10987 
10988 		if (kr != KERN_SUCCESS) {
10989 			panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
10990 		}
10991 
10992 		pmap_lock(pmap, PMAP_LOCK_SHARED);
10993 	}
10994 
10995 	if (*ttep != ARM_PTE_EMPTY) {
10996 		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
10997 	}
10998 
10999 	sptm_map_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level,
11000 	    (commpage_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID);
11001 
11002 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
11003 
11004 	return kr;
11005 }
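/*
 * Illustrative sketch (layout only, not compiled): the "nesting" performed
 * above makes every task's twig-level entry point at the same pre-built
 * commpage leaf table, so no per-process leaf table is needed:
 *
 *	task A root ttep --+                        +--> commpage data page
 *	                   +--> shared commpage ----+
 *	task B root ttep --+    leaf table          +--> commpage text page
 */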
11006 
11007 static void
11008 pmap_unmap_commpage(
11009 	pmap_t pmap)
11010 {
11011 	pt_entry_t *ptep;
11012 	vm_offset_t commpage_vaddr;
11013 
11014 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11015 	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
11016 	__assert_only pmap_paddr_t commpage_pa = commpage_data_pa;
11017 
11018 	if (pmap_is_64bit(pmap)) {
11019 		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11020 	} else {
11021 		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11022 	}
11023 
11024 
11025 	ptep = pmap_pte(pmap, commpage_vaddr);
11026 
11027 	if (ptep == NULL) {
11028 		return;
11029 	}
11030 
11031 	/* It had better be mapped to the shared page. */
11032 	if (pte_to_pa(*ptep) != commpage_pa) {
11033 		panic("%s: non-commpage PA 0x%llx mapped at VA 0x%llx in pmap %p; expected 0x%llx",
11034 		    __func__, (unsigned long long)pte_to_pa(*ptep), (unsigned long long)commpage_vaddr,
11035 		    pmap, (unsigned long long)commpage_pa);
11036 	}
11037 
11038 	sptm_unmap_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level);
11039 }
11040 
11041 void
11042 pmap_insert_commpage(
11043 	pmap_t pmap)
11044 {
11045 	pmap_insert_commpage_internal(pmap);
11046 }
11047 
11048 static boolean_t
11049 pmap_is_64bit(
11050 	pmap_t pmap)
11051 {
11052 	return pmap->is_64bit;
11053 }
11054 
11055 bool
11056 pmap_is_exotic(
11057 	pmap_t pmap __unused)
11058 {
11059 	return false;
11060 }
11061 
11062 
11063 /* ARMTODO -- provide an implementation that accounts for
11064  * holes in the physical map, if any.
11065  */
11066 boolean_t
11067 pmap_valid_page(
11068 	ppnum_t pn)
11069 {
11070 	return pa_valid(ptoa(pn));
11071 }
11072 
11073 boolean_t
11074 pmap_bootloader_page(
11075 	ppnum_t pn)
11076 {
11077 	pmap_paddr_t paddr = ptoa(pn);
11078 
11079 	if (pa_valid(paddr)) {
11080 		return FALSE;
11081 	}
11082 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11083 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11084 }
11085 
11086 MARK_AS_PMAP_TEXT boolean_t
11087 pmap_is_empty_internal(
11088 	pmap_t pmap,
11089 	vm_map_offset_t va_start,
11090 	vm_map_offset_t va_end)
11091 {
11092 	vm_map_offset_t block_start, block_end;
11093 	tt_entry_t *tte_p;
11094 
11095 	if (pmap == NULL) {
11096 		return TRUE;
11097 	}
11098 
11099 	validate_pmap(pmap);
11100 
11101 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11102 	unsigned int initial_not_in_kdp = not_in_kdp;
11103 
11104 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11105 		pmap_lock(pmap, PMAP_LOCK_SHARED);
11106 	}
11107 
11108 
11109 	/* TODO: This will be faster if we increment ttep at each level. */
11110 	block_start = va_start;
11111 
11112 	while (block_start < va_end) {
11113 		pt_entry_t     *bpte_p, *epte_p;
11114 		pt_entry_t     *pte_p;
11115 
11116 		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11117 		if (block_end > va_end) {
11118 			block_end = va_end;
11119 		}
11120 
11121 		tte_p = pmap_tte(pmap, block_start);
11122 		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
11123 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
11124 			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11125 			epte_p = &pte_p[pte_index(pt_attr, block_end)];
11126 
11127 			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11128 				if (*pte_p != ARM_PTE_EMPTY) {
11129 					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11130 						pmap_unlock(pmap, PMAP_LOCK_SHARED);
11131 					}
11132 					return FALSE;
11133 				}
11134 			}
11135 		}
11136 		block_start = block_end;
11137 	}
11138 
11139 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11140 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
11141 	}
11142 
11143 	return TRUE;
11144 }
11145 
11146 boolean_t
11147 pmap_is_empty(
11148 	pmap_t pmap,
11149 	vm_map_offset_t va_start,
11150 	vm_map_offset_t va_end)
11151 {
11152 	return pmap_is_empty_internal(pmap, va_start, va_end);
11153 }
11154 
11155 vm_map_offset_t
11156 pmap_max_offset(
11157 	boolean_t               is64,
11158 	unsigned int    option)
11159 {
11160 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11161 }
11162 
11163 vm_map_offset_t
11164 pmap_max_64bit_offset(
11165 	__unused unsigned int option)
11166 {
11167 	vm_map_offset_t max_offset_ret = 0;
11168 
11169 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11170 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11171 		max_offset_ret = arm64_pmap_max_offset_default;
11172 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11173 		max_offset_ret = min_max_offset;
11174 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11175 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11176 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11177 		if (arm64_pmap_max_offset_default) {
11178 			max_offset_ret = arm64_pmap_max_offset_default;
11179 		} else if (max_mem > 0xC0000000) {
11180 			// devices with > 3GB of memory
11181 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11182 		} else if (max_mem > 0x40000000) {
11183 			// devices with > 1GB and <= 3GB of memory
11184 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11185 		} else {
11186 			// devices with <= 1 GB of memory
11187 			max_offset_ret = min_max_offset;
11188 		}
11189 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11190 		if (arm64_pmap_max_offset_default) {
11191 			// Allow the boot-arg to override jumbo size
11192 			max_offset_ret = arm64_pmap_max_offset_default;
11193 		} else {
11194 			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
11195 		}
11196 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
11197 	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
11198 		max_offset_ret = MACH_VM_MAX_ADDRESS;
11199 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
11200 	} else {
11201 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
11202 	}
11203 
11204 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11205 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11206 		assert(max_offset_ret >= min_max_offset);
11207 	}
11208 
11209 	return max_offset_ret;
11210 }
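/*
 * Illustrative sketch (hypothetical caller, not compiled): sizing a 64-bit
 * user address space.  With no arm64_pmap_max_offset_default override,
 * ARM_PMAP_MAX_OFFSET_DEVICE picks a tier from the device's memory size:
 *
 *	vm_map_offset_t max = pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_DEVICE);
 *	//  > 3GB of memory  -> ARM64_MAX_OFFSET_DEVICE_LARGE
 *	//  1GB..3GB         -> ARM64_MAX_OFFSET_DEVICE_SMALL
 *	//  <= 1GB           -> ARM64_MIN_MAX_ADDRESS
 */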
11211 
11212 vm_map_offset_t
11213 pmap_max_32bit_offset(
11214 	unsigned int option)
11215 {
11216 	vm_map_offset_t max_offset_ret = 0;
11217 
11218 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11219 		max_offset_ret = arm_pmap_max_offset_default;
11220 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11221 		max_offset_ret = VM_MAX_ADDRESS;
11222 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11223 		max_offset_ret = VM_MAX_ADDRESS;
11224 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11225 		if (arm_pmap_max_offset_default) {
11226 			max_offset_ret = arm_pmap_max_offset_default;
11227 		} else if (max_mem > 0x20000000) {
11228 			max_offset_ret = VM_MAX_ADDRESS;
11229 		} else {
11230 			max_offset_ret = VM_MAX_ADDRESS;
11231 		}
11232 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11233 		max_offset_ret = VM_MAX_ADDRESS;
11234 	} else {
11235 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
11236 	}
11237 
11238 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11239 	return max_offset_ret;
11240 }
11241 
11242 #if CONFIG_DTRACE
11243 /*
11244  * Constrain DTrace copyin/copyout actions
11245  */
11246 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11247 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11248 
11249 kern_return_t
11250 dtrace_copyio_preflight(
11251 	__unused addr64_t va)
11252 {
11253 	if (current_map() == kernel_map) {
11254 		return KERN_FAILURE;
11255 	} else {
11256 		return KERN_SUCCESS;
11257 	}
11258 }
11259 
11260 kern_return_t
11261 dtrace_copyio_postflight(
11262 	__unused addr64_t va)
11263 {
11264 	return KERN_SUCCESS;
11265 }
11266 #endif /* CONFIG_DTRACE */
11267 
11268 
11269 void
11270 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11271 {
11272 }
11273 
11274 
11275 void
11276 pmap_flush(
11277 	__unused pmap_flush_context *cpus_to_flush)
11278 {
11279 	/* not implemented yet */
11280 	return;
11281 }
11282 
11283 /**
11284  * Perform basic validation checks on the destination only, and on the
11285  * corresponding offset/size, prior to writing to a read only allocation.
11286  *
11287  * @note Should be called before writing to an allocation from the read
11288  * only allocator.
11289  *
11290  * @param zid The ID of the zone the allocation belongs to.
11291  * @param va VA of element being modified (destination).
11292  * @param offset Offset being written to, in the element.
11293  * @param new_data_size Size of modification.
11294  *
11295  */
11296 
11297 MARK_AS_PMAP_TEXT static void
11298 pmap_ro_zone_validate_element_dst(
11299 	zone_id_t           zid,
11300 	vm_offset_t         va,
11301 	vm_offset_t         offset,
11302 	vm_size_t           new_data_size)
11303 {
11304 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11305 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11306 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11307 	}
11308 
11309 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
11310 
11311 	/* Check element is from correct zone and properly aligned */
11312 	zone_require_ro(zid, elem_size, (void*)va);
11313 
11314 	if (__improbable(new_data_size > (elem_size - offset))) {
11315 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
11316 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11317 	}
11318 	if (__improbable(offset >= elem_size)) {
11319 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
11320 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11321 	}
11322 }
11323 
11324 
11325 /**
11326  * Perform basic validation checks on the source, destination and
11327  * corresponding offset/sizes prior to writing to a read only allocation.
11328  *
11329  * @note Should be called before writing to an allocation from the read
11330  * only allocator.
11331  *
11332  * @param zid The ID of the zone the allocation belongs to.
11333  * @param va VA of element being modified (destination).
11334  * @param offset Offset being written to, in the element.
11335  * @param new_data Pointer to new data (source).
11336  * @param new_data_size Size of modification.
11337  *
11338  */
11339 
11340 MARK_AS_PMAP_TEXT static void
11341 pmap_ro_zone_validate_element(
11342 	zone_id_t           zid,
11343 	vm_offset_t         va,
11344 	vm_offset_t         offset,
11345 	const vm_offset_t   new_data,
11346 	vm_size_t           new_data_size)
11347 {
11348 	vm_offset_t sum = 0;
11349 
11350 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
11351 		panic("%s: Integer addition overflow %p + %lu = %lu",
11352 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
11353 	}
11354 
11355 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
11356 }
11357 
11358 /**
11359  * Function to configure RO zone access permissions for a forthcoming write operation.
11360  */
11361 static void
11362 pmap_ro_zone_prepare_write(void)
11363 {
11364 }
11365 
11366 /**
11367  * Function to indicate that a preceding RO zone write operation is complete.
11368  */
11369 static void
11370 pmap_ro_zone_complete_write(void)
11371 {
11372 }
11373 
11374 /**
11375  * Function to align an address or size to the required RO zone mapping alignment.
11376  *
11377  * For the SPTM the RO zone region must be aligned on a twig boundary so that at least
11378  * the last-level kernel pagetable can be of the appropriate SPTM RO zone table type,
11379  * which allows the SPTM to enforce RO zone mapping permission restrictions.
11380  *
11381  * @param value the address or size to be aligned.
11382  *
11383  * @return the aligned value
11384  */
11385 vm_offset_t
11386 pmap_ro_zone_align(vm_offset_t value)
11387 {
11388 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(kernel_pmap);
11389 	return PMAP_ALIGN(value, pt_attr_twig_size(pt_attr));
11390 }
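/*
 * Illustrative sketch (worked example, not compiled): assuming PMAP_ALIGN
 * rounds up and a 32MB twig size (16KB kernel pages), a hypothetical 5MB
 * RO zone request would be padded to a whole twig:
 *
 *	vm_offset_t aligned = pmap_ro_zone_align(5 << 20);   // -> 32MB
 */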
11391 
11392 /**
11393  * Function to copy kauth_cred from new_data to kv.
11394  * Function defined in "kern_prot.c"
11395  *
11396  * @note Will be removed upon completion of
11397  * <rdar://problem/72635194> Compiler PAC support for memcpy.
11398  *
11399  * @param kv Address to copy new data to.
11400  * @param new_data Pointer to new data.
11401  *
11402  */
11403 
11404 extern void
11405 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
11406 
11407 /**
11408  * Zalloc-specific memcpy that writes through the physical aperture
11409  * and ensures the element being modified is from a read-only zone.
11410  *
11411  * @note Designed to work only with the zone allocator's read-only submap.
11412  *
11413  * @param zid The ID of the zone to allocate from.
11414  * @param va VA of element to be modified.
11415  * @param offset Offset from element.
11416  * @param new_data Pointer to new data.
11417  * @param new_data_size	Size of modification.
11418  *
11419  */
11420 
11421 void
11422 pmap_ro_zone_memcpy(
11423 	zone_id_t           zid,
11424 	vm_offset_t         va,
11425 	vm_offset_t         offset,
11426 	const vm_offset_t   new_data,
11427 	vm_size_t           new_data_size)
11428 {
11429 	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
11430 }
11431 
11432 MARK_AS_PMAP_TEXT void
11433 pmap_ro_zone_memcpy_internal(
11434 	zone_id_t             zid,
11435 	vm_offset_t           va,
11436 	vm_offset_t           offset,
11437 	const vm_offset_t     new_data,
11438 	vm_size_t             new_data_size)
11439 {
11440 	if (!new_data || new_data_size == 0) {
11441 		return;
11442 	}
11443 
11444 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11445 	const bool istate = ml_set_interrupts_enabled(FALSE);
11446 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
11447 	pmap_ro_zone_prepare_write();
11448 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
11449 	pmap_ro_zone_complete_write();
11450 	ml_set_interrupts_enabled(istate);
11451 }
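/*
 * Illustrative sketch (hypothetical caller, not compiled): updating a single
 * field of an element that lives in a read-only zone.  The zone id, element
 * address, and struct are made up; the call pattern matches the function above.
 *
 *	uint32_t new_flags = 0x1;
 *	pmap_ro_zone_memcpy(zid, elem_va, offsetof(struct my_ro_elem, flags),
 *	    (vm_offset_t)&new_flags, sizeof(new_flags));
 */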
11452 
11453 /**
11454  * Zalloc-specific function to atomically mutate fields of an element that
11455  * belongs to a read-only zone, via the physical aperture.
11456  *
11457  * @note Designed to work only with the zone allocator's read-only submap.
11458  *
11459  * @param zid The ID of the zone the element belongs to.
11460  * @param va VA of element to be modified.
11461  * @param offset Offset in element.
11462  * @param op Atomic operation to perform.
11463  * @param value	Mutation value.
11464  *
11465  */
11466 
11467 uint64_t
11468 pmap_ro_zone_atomic_op(
11469 	zone_id_t             zid,
11470 	vm_offset_t           va,
11471 	vm_offset_t           offset,
11472 	zro_atomic_op_t       op,
11473 	uint64_t              value)
11474 {
11475 	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
11476 }
11477 
11478 MARK_AS_PMAP_TEXT uint64_t
11479 pmap_ro_zone_atomic_op_internal(
11480 	zone_id_t             zid,
11481 	vm_offset_t           va,
11482 	vm_offset_t           offset,
11483 	zro_atomic_op_t       op,
11484 	uint64_t              value)
11485 {
11486 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11487 	vm_size_t value_size = op & 0xf;
11488 	const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11489 
11490 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
11491 	pmap_ro_zone_prepare_write();
11492 	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
11493 	pmap_ro_zone_complete_write();
11494 	ml_set_interrupts_enabled(istate);
11495 
11496 	return value;
11497 }
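/*
 * Illustrative sketch (hypothetical caller, not compiled): the low nibble of
 * the zro_atomic_op_t is treated as the operand size in the bounds check above
 * (op & 0xf).  A made-up 4-byte atomic op on a made-up field:
 *
 *	uint64_t result = pmap_ro_zone_atomic_op(zid, elem_va,
 *	    offsetof(struct my_ro_elem, bits), my_4byte_op, 0x1);
 */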
11498 
11499 /**
11500  * bzero for allocations from read only zones that writes through the
11501  * physical aperture.
11502  *
11503  * @note This is called by the zfree path of all allocations from read
11504  * only zones.
11505  *
11506  * @param zid The ID of the zone the allocation belongs to.
11507  * @param va VA of element to be zeroed.
11508  * @param offset Offset in the element.
11509  * @param size	Size of allocation.
11510  *
11511  */
11512 
11513 void
11514 pmap_ro_zone_bzero(
11515 	zone_id_t       zid,
11516 	vm_offset_t     va,
11517 	vm_offset_t     offset,
11518 	vm_size_t       size)
11519 {
11520 	pmap_ro_zone_bzero_internal(zid, va, offset, size);
11521 }
11522 
11523 MARK_AS_PMAP_TEXT void
11524 pmap_ro_zone_bzero_internal(
11525 	zone_id_t       zid,
11526 	vm_offset_t     va,
11527 	vm_offset_t     offset,
11528 	vm_size_t       size)
11529 {
11530 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
11531 	const boolean_t istate = ml_set_interrupts_enabled(FALSE);
11532 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
11533 	pmap_ro_zone_prepare_write();
11534 	bzero((void*)phystokv(pa), size);
11535 	pmap_ro_zone_complete_write();
11536 	ml_set_interrupts_enabled(istate);
11537 }
11538 
11539 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
11540 
11541 MARK_AS_PMAP_TEXT mach_vm_size_t
11542 pmap_query_resident_internal(
11543 	pmap_t                  pmap,
11544 	vm_map_address_t        start,
11545 	vm_map_address_t        end,
11546 	mach_vm_size_t          *compressed_bytes_p)
11547 {
11548 	mach_vm_size_t  resident_bytes = 0;
11549 	mach_vm_size_t  compressed_bytes = 0;
11550 
11551 	pt_entry_t     *bpte, *epte;
11552 	pt_entry_t     *pte_p;
11553 	tt_entry_t     *tte_p;
11554 
11555 	if (pmap == NULL) {
11556 		return PMAP_RESIDENT_INVALID;
11557 	}
11558 
11559 	validate_pmap(pmap);
11560 
11561 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11562 
11563 	/* Ensure that this request is valid, and addresses exactly one TTE. */
11564 	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
11565 	    (end % pt_attr_page_size(pt_attr)))) {
11566 		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
11567 	}
11568 
11569 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
11570 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
11571 	}
11572 
11573 	pmap_lock(pmap, PMAP_LOCK_SHARED);
11574 	tte_p = pmap_tte(pmap, start);
11575 	if (tte_p == (tt_entry_t *) NULL) {
11576 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
11577 		return PMAP_RESIDENT_INVALID;
11578 	}
11579 	if (tte_is_valid_table(*tte_p)) {
11580 		pte_p = (pt_entry_t *) ttetokv(*tte_p);
11581 		bpte = &pte_p[pte_index(pt_attr, start)];
11582 		epte = &pte_p[pte_index(pt_attr, end)];
11583 
11584 		for (; bpte < epte; bpte++) {
11585 			if (pte_is_compressed(*bpte, bpte)) {
11586 				compressed_bytes += pt_attr_page_size(pt_attr);
11587 			} else if (pa_valid(pte_to_pa(*bpte))) {
11588 				resident_bytes += pt_attr_page_size(pt_attr);
11589 			}
11590 		}
11591 	}
11592 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
11593 
11594 	if (compressed_bytes_p) {
11595 		*compressed_bytes_p += compressed_bytes;
11596 	}
11597 
11598 	return resident_bytes;
11599 }
11600 
11601 mach_vm_size_t
11602 pmap_query_resident(
11603 	pmap_t                  pmap,
11604 	vm_map_address_t        start,
11605 	vm_map_address_t        end,
11606 	mach_vm_size_t          *compressed_bytes_p)
11607 {
11608 	mach_vm_size_t          total_resident_bytes;
11609 	mach_vm_size_t          compressed_bytes;
11610 	vm_map_address_t        va;
11611 
11612 
11613 	if (pmap == PMAP_NULL) {
11614 		if (compressed_bytes_p) {
11615 			*compressed_bytes_p = 0;
11616 		}
11617 		return 0;
11618 	}
11619 
11620 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11621 
11622 	total_resident_bytes = 0;
11623 	compressed_bytes = 0;
11624 
11625 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
11626 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
11627 	    VM_KERNEL_ADDRHIDE(end));
11628 
11629 	va = start;
11630 	while (va < end) {
11631 		vm_map_address_t l;
11632 		mach_vm_size_t resident_bytes;
11633 
11634 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
11635 
11636 		if (l > end) {
11637 			l = end;
11638 		}
11639 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
11640 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
11641 			break;
11642 		}
11643 
11644 		total_resident_bytes += resident_bytes;
11645 
11646 		va = l;
11647 	}
11648 
11649 	if (compressed_bytes_p) {
11650 		*compressed_bytes_p = compressed_bytes;
11651 	}
11652 
11653 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
11654 	    total_resident_bytes);
11655 
11656 	return total_resident_bytes;
11657 }
11658 
11659 #if MACH_ASSERT
11660 static void
11661 pmap_check_ledgers(
11662 	pmap_t pmap)
11663 {
11664 	int     pid;
11665 	char    *procname;
11666 
11667 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
11668 		/*
11669 		 * This pmap was not or is no longer fully associated
11670 		 * with a task (e.g. the old pmap after a fork()/exec() or
11671 		 * spawn()).  Its "ledger" still points at a task that is
11672 		 * now using a different (and active) address space, so
11673 		 * we can't check that all the pmap ledgers are balanced here.
11674 		 *
11675 		 * If the "pid" is set, that means that we went through
11676 		 * pmap_set_process() in task_terminate_internal(), so
11677 		 * this task's ledger should not have been re-used and
11678 		 * all the pmap ledgers should be back to 0.
11679 		 */
11680 		return;
11681 	}
11682 
11683 	pid = pmap->pmap_pid;
11684 	procname = pmap->pmap_procname;
11685 
11686 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
11687 }
11688 #endif /* MACH_ASSERT */
11689 
11690 void
11691 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
11692 {
11693 }
11694 
11695 /**
11696  * The minimum shared region nesting size is used by the VM to determine when to
11697  * break up large mappings to nested regions. The smallest size that these
11698  * mappings can be broken into is determined by what page table level those
11699  * regions are being nested in at and the size of the page tables.
11700  *
11701  * For instance, if a nested region is nesting at L2 for a process utilizing
11702  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
11703  * block entry).
11704  *
11705  * @param pmap The target pmap to determine the block size based on whether it's
11706  *             using 16KB or 4KB page tables.
11707  */
11708 uint64_t
11709 pmap_shared_region_size_min(__unused pmap_t pmap)
11710 {
11711 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11712 
11713 	/**
11714 	 * We always nest the shared region at L2 (32MB for 16KB pages, 8MB for
11715 	 * 4KB pages). This means that a target pmap will contain L2 entries that
11716 	 * point to shared L3 page tables in the shared region pmap.
11717 	 */
11718 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
11719 	return pt_attr_twig_size(pt_attr) * page_ratio;
11720 }
11721 
11722 boolean_t
11723 pmap_enforces_execute_only(
11724 	pmap_t pmap)
11725 {
11726 	return pmap != kernel_pmap;
11727 }
11728 
11729 MARK_AS_PMAP_TEXT void
11730 pmap_set_vm_map_cs_enforced_internal(
11731 	pmap_t pmap,
11732 	bool new_value)
11733 {
11734 	validate_pmap_mutable(pmap);
11735 	pmap->pmap_vm_map_cs_enforced = new_value;
11736 }
11737 
11738 void
11739 pmap_set_vm_map_cs_enforced(
11740 	pmap_t pmap,
11741 	bool new_value)
11742 {
11743 	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
11744 }
11745 
11746 extern int cs_process_enforcement_enable;
11747 bool
11748 pmap_get_vm_map_cs_enforced(
11749 	pmap_t pmap)
11750 {
11751 	if (cs_process_enforcement_enable) {
11752 		return true;
11753 	}
11754 	return pmap->pmap_vm_map_cs_enforced;
11755 }
11756 
11757 MARK_AS_PMAP_TEXT void
11758 pmap_set_jit_entitled_internal(
11759 	__unused pmap_t pmap)
11760 {
11761 }
11762 
11763 void
11764 pmap_set_jit_entitled(
11765 	pmap_t pmap)
11766 {
11767 	pmap_set_jit_entitled_internal(pmap);
11768 }
11769 
11770 bool
11771 pmap_get_jit_entitled(
11772 	__unused pmap_t pmap)
11773 {
11774 	return false;
11775 }
11776 
11777 MARK_AS_PMAP_TEXT void
11778 pmap_set_tpro_internal(
11779 	__unused pmap_t pmap)
11780 {
11781 	return;
11782 }
11783 
11784 void
11785 pmap_set_tpro(
11786 	pmap_t pmap)
11787 {
11788 	pmap_set_tpro_internal(pmap);
11789 }
11790 
11791 bool
11792 pmap_get_tpro(
11793 	__unused pmap_t pmap)
11794 {
11795 	return false;
11796 }
11797 
11798 #if HAS_MTE
11799 void
11800 pmap_set_tag_check_enabled(
11801 	pmap_t pmap)
11802 {
11803 	validate_pmap_mutable(pmap);
11804 
11805 	if (pmap->type == PMAP_TYPE_USER) {
11806 		sptm_configure_root(pmap->ttep, SPTM_ROOT_PT_FLAG_MTE, SPTM_ROOT_PT_FLAG_MTE);
11807 	}
11808 }
11809 
11810 void
11811 pmap_set_user_tag_check_faults_disabled(
11812 	pmap_t pmap)
11813 {
11814 	validate_pmap_mutable(pmap);
11815 
11816 	if (pmap->type != PMAP_TYPE_USER) {
11817 		return;
11818 	}
11819 
11820 	sptm_configure_root(pmap->ttep, SPTM_ROOT_PT_FLAG_NO_TAG_FAULT, SPTM_ROOT_PT_FLAG_NO_TAG_FAULT);
11821 	if (pmap == current_pmap()) {
11822 		/* SPTM defers reconfiguring TCF0 until the next sptm_switch_root() call */
11823 		sptm_return_t __assert_only ret = sptm_switch_root(pmap->ttep, 0, 0);
11824 		assert3u(ret & SPTM_SUCCESS, ==, SPTM_SUCCESS);
11825 	}
11826 }
11827 #endif /* HAS_MTE */
11828 
11829 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
11830 
11831 MARK_AS_PMAP_TEXT kern_return_t
11832 pmap_query_page_info_internal(
11833 	pmap_t          pmap,
11834 	vm_map_offset_t va,
11835 	int             *disp_p)
11836 {
11837 	pmap_paddr_t    pa;
11838 	int             disp;
11839 	unsigned int    pai;
11840 	pt_entry_t      *pte_p;
11841 	pv_entry_t      *pve_p;
11842 
11843 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
11844 		*disp_p = 0;
11845 		return KERN_INVALID_ARGUMENT;
11846 	}
11847 
11848 	validate_pmap(pmap);
11849 	pmap_lock(pmap, PMAP_LOCK_SHARED);
11850 
11851 try_again:
11852 	disp = 0;
11853 
11854 	pte_p = pmap_pte(pmap, va);
11855 	if (pte_p == PT_ENTRY_NULL) {
11856 		goto done;
11857 	}
11858 
11859 	const pt_entry_t pte = os_atomic_load(pte_p, relaxed);
11860 	pa = pte_to_pa(pte);
11861 	if (pa == 0) {
11862 		if (pte_is_compressed(pte, pte_p)) {
11863 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
11864 			if (pte & ARM_PTE_COMPRESSED_ALT) {
11865 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
11866 			}
11867 		}
11868 	} else {
11869 		disp |= PMAP_QUERY_PAGE_PRESENT;
11870 		pai = pa_index(pa);
11871 		if (!pa_valid(pa)) {
11872 			goto done;
11873 		}
11874 		locked_pvh_t locked_pvh = pvh_lock(pai);
11875 		if (__improbable(pte != os_atomic_load(pte_p, relaxed))) {
11876 			/* something changed: try again */
11877 			pvh_unlock(&locked_pvh);
11878 			pmap_query_page_info_retries++;
11879 			goto try_again;
11880 		}
11881 		pve_p = PV_ENTRY_NULL;
11882 		int pve_ptep_idx = 0;
11883 		if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
11884 			unsigned int npves = 0;
11885 			pve_p = pvh_pve_list(locked_pvh.pvh);
11886 			while (pve_p != PV_ENTRY_NULL &&
11887 			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
11888 				if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
11889 					pvh_lock_enter_sleep_mode(&locked_pvh);
11890 				}
11891 				pve_p = pve_next(pve_p);
11892 				npves++;
11893 			}
11894 		}
11895 
11896 		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
11897 			disp |= PMAP_QUERY_PAGE_ALTACCT;
11898 		} else if (ppattr_test_reusable(pai)) {
11899 			disp |= PMAP_QUERY_PAGE_REUSABLE;
11900 		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
11901 			disp |= PMAP_QUERY_PAGE_INTERNAL;
11902 		}
11903 		pvh_unlock(&locked_pvh);
11904 	}
11905 
11906 done:
11907 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
11908 	*disp_p = disp;
11909 	return KERN_SUCCESS;
11910 }
11911 
11912 kern_return_t
11913 pmap_query_page_info(
11914 	pmap_t          pmap,
11915 	vm_map_offset_t va,
11916 	int             *disp_p)
11917 {
11918 	return pmap_query_page_info_internal(pmap, va, disp_p);
11919 }
11920 
11921 
11922 
11923 uint32_t
11924 pmap_user_va_bits(pmap_t pmap __unused)
11925 {
11926 #if __ARM_MIXED_PAGE_SIZE__
11927 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
11928 	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
11929 #else
11930 	return 64 - T0SZ_BOOT;
11931 #endif
11932 }
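/*
 * Worked example: the user VA width is 64 - T0SZ.  A hypothetical T0SZ of 25
 * would yield a 39-bit (512GB) user address space; the real value comes from
 * T0SZ_BOOT or the per-pmap TCR when __ARM_MIXED_PAGE_SIZE__ is enabled.
 */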
11933 
11934 uint32_t
11935 pmap_kernel_va_bits(void)
11936 {
11937 	return 64 - T1SZ_BOOT;
11938 }
11939 
11940 static vm_map_size_t
11941 pmap_user_va_size(pmap_t pmap)
11942 {
11943 	return 1ULL << pmap_user_va_bits(pmap);
11944 }
11945 
11946 #if HAS_MTE || HAS_MTE_EMULATION_SHIMS
11947 static vm_map_address_t
11948 pmap_strip_user_addr(pmap_t pmap, vm_map_address_t ptr)
11949 {
11950 	assert(pmap && pmap != kernel_pmap);
11951 
11952 	/*
11953 	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR0 address.
11954 	 * Ignore the strip request.
11955 	 */
11956 	if ((ptr & TTBR_SELECTOR) != 0) {
11957 		return ptr;
11958 	}
11959 
11960 	/* This will reset the TTBR_SELECTOR, but we've confirmed its value above. */
11961 	return ptr & (pmap->max - 1);
11962 }
11963 
11964 static vm_map_address_t
11965 pmap_strip_kernel_addr(pmap_t pmap, vm_map_address_t ptr)
11966 {
11967 	assert(pmap && pmap == kernel_pmap);
11968 
11969 	/*
11970 	 * TTBR_SELECTOR doesn't match our intention of canonicalizing a TTBR1 address.
11971 	 * Ignore the strip request.
11972 	 */
11973 	if ((ptr & TTBR_SELECTOR) == 0) {
11974 		return ptr;
11975 	}
11976 
11977 	/* This will reset the TTBR_SELECTOR, but we've confirmed its value above. */
11978 	return ptr | pmap->min;
11979 }
11980 
11981 vm_map_address_t
11982 pmap_strip_addr(pmap_t pmap, vm_map_address_t ptr)
11983 {
11984 	assert(pmap);
11985 
11986 	return pmap == kernel_pmap ? pmap_strip_kernel_addr(pmap, ptr) :
11987 	       pmap_strip_user_addr(pmap, ptr);
11988 }
11989 #endif /* HAS_MTE || HAS_MTE_EMULATION_SHIMS */
11990 
11991 
11992 bool
11993 pmap_in_ppl(void)
11994 {
11995 	return false;
11996 }
11997 
11998 MARK_AS_PMAP_TEXT void
11999 pmap_footprint_suspend_internal(
12000 	vm_map_t        map,
12001 	boolean_t       suspend)
12002 {
12003 #if DEVELOPMENT || DEBUG
12004 	if (suspend) {
12005 		current_thread()->pmap_footprint_suspended = TRUE;
12006 		map->pmap->footprint_was_suspended = TRUE;
12007 	} else {
12008 		current_thread()->pmap_footprint_suspended = FALSE;
12009 	}
12010 #else /* DEVELOPMENT || DEBUG */
12011 	(void) map;
12012 	(void) suspend;
12013 #endif /* DEVELOPMENT || DEBUG */
12014 }
12015 
12016 void
12017 pmap_footprint_suspend(
12018 	vm_map_t map,
12019 	boolean_t suspend)
12020 {
12021 	pmap_footprint_suspend_internal(map, suspend);
12022 }
12023 
12024 void
12025 pmap_nop(pmap_t pmap)
12026 {
12027 	validate_pmap_mutable(pmap);
12028 }
12029 
12030 pmap_t
12031 pmap_txm_kernel_pmap(void)
12032 {
12033 	return kernel_pmap;
12034 }
12035 
12036 TXMAddressSpace_t*
12037 pmap_txm_addr_space(const pmap_t pmap)
12038 {
12039 	if (pmap) {
12040 		return pmap->txm_addr_space;
12041 	}
12042 
12043 	/*
12044 	 * When the passed in PMAP is NULL, it means the caller wishes to operate
12045 	 * on the current_pmap(). We could resolve and return that, but it is actually
12046 	 * safer to return NULL since these TXM interfaces also accept NULL inputs
12047 	 * which causes TXM to resolve to the current_pmap() equivalent internally.
12048 	 */
12049 	return NULL;
12050 }
12051 
12052 void
12053 pmap_txm_set_addr_space(
12054 	pmap_t pmap,
12055 	TXMAddressSpace_t *txm_addr_space)
12056 {
12057 	assert(pmap != NULL);
12058 
12059 	if (pmap->txm_addr_space && txm_addr_space) {
12060 		/* Attempted to overwrite the address space in the PMAP */
12061 		panic("attempted overwrite of TXM address space: %p | %p | %p",
12062 		    pmap, pmap->txm_addr_space, txm_addr_space);
12063 	} else if (!pmap->txm_addr_space && !txm_addr_space) {
12064 		/* This should never happen */
12065 		panic("attempted NULL overwrite of TXM address space: %p", pmap);
12066 	}
12067 
12068 	pmap->txm_addr_space = txm_addr_space;
12069 }
12070 
12071 void
12072 pmap_txm_set_trust_level(
12073 	pmap_t pmap,
12074 	CSTrust_t trust_level)
12075 {
12076 	assert(pmap != NULL);
12077 
12078 	CSTrust_t current_trust = pmap->txm_trust_level;
12079 	if (current_trust != kCSTrustUntrusted) {
12080 		panic("attempted to overwrite TXM trust on the pmap: %p", pmap);
12081 	}
12082 
12083 	pmap->txm_trust_level = trust_level;
12084 }
12085 
12086 kern_return_t
12087 pmap_txm_get_trust_level_kdp(
12088 	pmap_t pmap,
12089 	CSTrust_t *trust_level)
12090 {
12091 	if (pmap == NULL) {
12092 		return KERN_INVALID_ARGUMENT;
12093 	} else if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
12094 		return KERN_INVALID_ARGUMENT;
12095 	}
12096 
12097 	if (trust_level != NULL) {
12098 		*trust_level = pmap->txm_trust_level;
12099 	}
12100 	return KERN_SUCCESS;
12101 }
12102 
12103 kern_return_t
12104 pmap_txm_get_jit_address_range_kdp(
12105 	pmap_t pmap,
12106 	uintptr_t *jit_region_start,
12107 	uintptr_t *jit_region_end)
12108 {
12109 	if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
12110 		return KERN_INVALID_ARGUMENT;
12111 	}
12112 	TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
12113 	if (NULL == txm_addr_space) {
12114 		return KERN_INVALID_ARGUMENT;
12115 	}
12116 	if (ml_validate_nofault((vm_offset_t)txm_addr_space, sizeof(*txm_addr_space)) == false) {
12117 		return KERN_INVALID_ARGUMENT;
12118 	}
12119 	/**
12120 	 * It's a bit gross that we're dereferencing what is supposed to be an abstract type.
12121 	 * If we were running in the TXM, we would always perform additional checks on txm_addr_space,
12122 	 * but this isn't necessary here, since we are running in the kernel and only using the results for
12123 	 * diagnostic purposes, rather than any policy enforcement.
12124 	 */
12125 	if (txm_addr_space->jitRegion) {
12126 		if (ml_validate_nofault((vm_offset_t)txm_addr_space->jitRegion, sizeof(*txm_addr_space->jitRegion)) == false) {
12127 			return KERN_INVALID_ARGUMENT;
12128 		}
12129 		if (txm_addr_space->jitRegion->addr && txm_addr_space->jitRegion->addrEnd) {
12130 			*jit_region_start = txm_addr_space->jitRegion->addr;
12131 			*jit_region_end = txm_addr_space->jitRegion->addrEnd;
12132 			return KERN_SUCCESS;
12133 		}
12134 	}
12135 	return KERN_NOT_FOUND;
12136 }
12137 
12138 static pmap_t
12139 _pmap_txm_resolve_pmap(pmap_t pmap)
12140 {
12141 	if (pmap == NULL) {
12142 		pmap = current_pmap();
12143 		if (pmap == kernel_pmap) {
12144 			return NULL;
12145 		}
12146 	}
12147 
12148 	return pmap;
12149 }
12150 
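/**
 * Reader/writer lock helpers for a pmap's TXM state (txm_lck).
 *
 * A NULL pmap is resolved to current_pmap(); if that resolves to the kernel
 * pmap, the helpers silently do nothing. Acquisitions must be paired with the
 * matching release, e.g. (illustrative sketch only, not a required pattern):
 *
 *	pmap_txm_acquire_shared_lock(pmap);
 *	... read TXM state ...
 *	pmap_txm_release_shared_lock(pmap);
 */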
12151 void
12152 pmap_txm_acquire_shared_lock(pmap_t pmap)
12153 {
12154 	pmap = _pmap_txm_resolve_pmap(pmap);
12155 	if (!pmap) {
12156 		return;
12157 	}
12158 
12159 	lck_rw_lock_shared(&pmap->txm_lck);
12160 }
12161 
12162 void
12163 pmap_txm_release_shared_lock(pmap_t pmap)
12164 {
12165 	pmap = _pmap_txm_resolve_pmap(pmap);
12166 	if (!pmap) {
12167 		return;
12168 	}
12169 
12170 	lck_rw_unlock_shared(&pmap->txm_lck);
12171 }
12172 
12173 void
12174 pmap_txm_acquire_exclusive_lock(pmap_t pmap)
12175 {
12176 	pmap = _pmap_txm_resolve_pmap(pmap);
12177 	if (!pmap) {
12178 		return;
12179 	}
12180 
12181 	lck_rw_lock_exclusive(&pmap->txm_lck);
12182 }
12183 
12184 void
12185 pmap_txm_release_exclusive_lock(pmap_t pmap)
12186 {
12187 	pmap = _pmap_txm_resolve_pmap(pmap);
12188 	if (!pmap) {
12189 		return;
12190 	}
12191 
12192 	lck_rw_unlock_exclusive(&pmap->txm_lck);
12193 }
12194 
12195 static void
12196 _pmap_txm_transfer_page(const pmap_paddr_t addr)
12197 {
12198 	sptm_retype_params_t retype_params = {
12199 		.raw = SPTM_RETYPE_PARAMS_NULL
12200 	};
12201 
12202 	/* Retype through the SPTM */
12203 	sptm_retype(addr, XNU_DEFAULT, TXM_DEFAULT, retype_params);
12204 }
12205 
12206 /**
12207  * Prepare a page for retyping to TXM_DEFAULT by clearing its
12208  * internal flags.
12209  *
12210  * @param pa Physical address of the page.
12211  */
12212 static inline void
12213 _pmap_txm_retype_prepare(const pmap_paddr_t pa)
12214 {
12215 	const sptm_retype_params_t retype_params = {
12216 		.raw = SPTM_RETYPE_PARAMS_NULL
12217 	};
12218 
12219 	/**
12220 	 * SPTM allows XNU_DEFAULT pages to request deferral of TLB flushing
12221 	 * when their PTEs are updated, which is an important performance
12222 	 * optimization. However, this also allows an attacker-controlled
12223 	 * XNU to exploit a read reference with a stale write-enabled PTE in
12224 	 * the TLB. That is acceptable as long as the page is not retyped,
12225 	 * since the damage is contained within the XNU domain. When such a
12226 	 * page does need to be retyped, SPTM has to make sure there is no
12227 	 * outstanding reference, or no history of deferred TLBIs. Internally,
12228 	 * SPTM maintains a flag tracking past deferred TLBIs that only gets
12229 	 * cleared on a retype with no outstanding references. Therefore, we
12230 	 * do a dummy retype to XNU_DEFAULT itself to clear the internal flag
12231 	 * before we actually transfer this page to the TXM domain. To make
12232 	 * sure SPTM won't throw a violation, all mappings of the page have
12233 	 * to be removed before calling this.
12234 	 */
12235 	sptm_retype(pa, XNU_DEFAULT, XNU_DEFAULT, retype_params);
12236 }
12237 
12238 /**
12239  * Transfer an XNU owned page to TXM domain.
12240  *
12241  * @param addr Kernel virtual address of the page. It has to be page size
12242  *             aligned.
12243  */
12244 void
12245 pmap_txm_transfer_page(const vm_address_t addr)
12246 {
12247 	assert((addr & PAGE_MASK) == 0);
12248 
12249 	const pmap_paddr_t pa = kvtophys_nofail(addr);
12250 	const unsigned int pai = pa_index(pa);
12251 
12252 	/* Lock the PVH lock to prevent concurrent updates to the mappings during the self retype below. */
12253 	locked_pvh_t locked_pvh = pvh_lock(pai);
12254 
12255 	/* Disconnect the mapping to assure SPTM of no pending TLBI. */
12256 	pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
12257 	    PMAP_OPTIONS_PPO_PENDING_RETYPE, &locked_pvh, NULL);
12258 
12259 	/* Self retype to clear the SPTM internal flags tracking delayed TLBIs for revoked writes. */
12260 	_pmap_txm_retype_prepare(pa);
12261 
12262 	pvh_unlock(&locked_pvh);
12263 
12264 	/* XNU needs to hold an RO reference to the page despite the ownership being transferred to TXM. */
12265 	pmap_enter_addr(kernel_pmap, addr, pa, VM_PROT_READ, VM_PROT_NONE, 0, true, PMAP_MAPPING_TYPE_INFER);
12266 
12267 	/* Finally, retype the page to TXM_DEFAULT. */
12268 	_pmap_txm_transfer_page(pa);
12269 }
12270 
12271 struct vm_object txm_vm_object_storage VM_PAGE_PACKED_ALIGNED;
12272 SECURITY_READ_ONLY_LATE(vm_object_t) txm_vm_object = &txm_vm_object_storage;
12273 
12274 _Static_assert(sizeof(vm_map_address_t) == sizeof(pmap_paddr_t),
12275     "sizeof(vm_map_address_t) != sizeof(pmap_paddr_t)");
12276 
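/**
 * Allocate and wire a page on behalf of TXM.
 *
 * The page is grabbed from the VM free list (waiting for one if necessary),
 * wired, inserted into the dedicated txm_vm_object at offset (pa - gPhysBase),
 * and finally retyped from XNU_DEFAULT to TXM_DEFAULT.
 *
 * @return The physical address of the newly allocated TXM page.
 */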
12277 vm_map_address_t
12278 pmap_txm_allocate_page(void)
12279 {
12280 	pmap_paddr_t phys_addr = 0;
12281 	vm_page_t page = VM_PAGE_NULL;
12282 	boolean_t thread_vm_privileged = false;
12283 
12284 	/* We are allowed to allocate privileged memory */
12285 	thread_vm_privileged = set_vm_privilege(true);
12286 
12287 	/* Allocate a page from the VM free list */
12288 	vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
12289 	while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
12290 		VM_PAGE_WAIT();
12291 	}
12292 
12293 	/* Wire all of the pages allocated for TXM */
12294 	vm_page_lock_queues();
12295 	vm_page_wire(page, VM_KERN_MEMORY_SECURITY, TRUE);
12296 	vm_page_unlock_queues();
12297 
12298 	phys_addr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page));
12299 	if (phys_addr == 0) {
12300 		panic("invalid VM page allocated for TXM: %llu", phys_addr);
12301 	}
12302 
12303 	/* Add the physical page to the TXM VM object */
12304 	vm_object_lock(txm_vm_object);
12305 	vm_page_insert_wired(
12306 		page,
12307 		txm_vm_object,
12308 		phys_addr - gPhysBase,
12309 		VM_KERN_MEMORY_SECURITY);
12310 	vm_object_unlock(txm_vm_object);
12311 
12312 	/* Reset thread privilege */
12313 	set_vm_privilege(thread_vm_privileged);
12314 
12315 	/* Retype the page */
12316 	_pmap_txm_transfer_page(phys_addr);
12317 
12318 	return phys_addr;
12319 }
12320 
12321 int
12322 pmap_cs_configuration(void)
12323 {
12324 	code_signing_config_t config = 0;
12325 
12326 	/* Compute the code signing configuration */
12327 	code_signing_configuration(NULL, &config);
12328 
12329 	return (int)config;
12330 }
12331 
12332 bool
12333 pmap_performs_stage2_translations(
12334 	__unused pmap_t pmap)
12335 {
12336 	return false;
12337 }
12338 
12339 bool
12340 pmap_has_iofilter_protected_write(void)
12341 {
12342 #if HAS_GUARDED_IO_FILTER
12343 	return true;
12344 #else
12345 	return false;
12346 #endif
12347 }
12348 
12349 #if HAS_GUARDED_IO_FILTER
12350 
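/**
 * Perform a protected write on behalf of the kernel through the guarded I/O
 * filter.
 *
 * @param addr Kernel virtual address to write to.
 * @param value Value to write.
 * @param width Width of the write in bytes; must be 1, 2, 4 or 8.
 *
 * If the target frame is typed XNU_PROTECTED_IO, the write is forwarded to
 * the SPTM I/O filter (or its hibernation-specific variant on the hibernation
 * path); otherwise the write is performed directly from kernel mode.
 */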
12351 void
12352 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
12353 {
12354 	/**
12355 	 * Even though this is done from EL1/2 for an address potentially owned by Guarded
12356 	 * Mode, we should be fine, as mmu_kvtop uses "at s1e1r", which checks for read
12357 	 * access only.
12358 	 */
12359 	const pmap_paddr_t pa = mmu_kvtop(addr);
12360 
12361 	if (!pa) {
12362 		panic("%s: addr 0x%016llx doesn't have a valid kernel mapping", __func__, (uint64_t) addr);
12363 	}
12364 
12365 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
12366 	if (frame_type == XNU_PROTECTED_IO) {
12367 		bool is_hibernating = false;
12368 		if (__improbable(is_hibernating)) {
12369 			/**
12370 			 * Default to NO_PANICKING_DOMAIN rather than INVALID_DOMAIN, since
12371 			 * INVALID_DOMAIN is used to indicate a panic in the dispatch logic itself.
12372 			 */
12373 			sptm_domain_t panic_source = NO_PANICKING_DOMAIN;
12374 			(void)sptm_panic_source(&panic_source);
12375 
12376 			/**
12377 			 * If panic_source is invalid (NO_PANICKING_DOMAIN: sptm_panic_source() failed
12378 			 * or no panic occurred) OR if the panic_source is XNU_DOMAIN, then use the
12379 			 * hibernation-specific write.
12380 			 */
12381 			if (panic_source == NO_PANICKING_DOMAIN || panic_source == XNU_DOMAIN) {
12382 				sptm_hib_iofilter_protected_write(pa, value, width);
12383 			} else {
12384 				/* Panic source is valid (panic occurred) and not XNU_DOMAIN */
12385 				sptm_iofilter_protected_write(pa, value, width);
12386 			}
12387 		} else {
12388 			sptm_iofilter_protected_write(pa, value, width);
12389 		}
12390 	} else {
12391 		/* The mapping is valid but not covered by the I/O filter. However, we still
12392 		 * try accessing the address from kernel mode. This allows addresses that are
12393 		 * not owned by the SPTM to be accessed via this interface.
12394 		 */
12395 		switch (width) {
12396 		case 1:
12397 			*(volatile uint8_t *)addr = (uint8_t) value;
12398 			break;
12399 		case 2:
12400 			*(volatile uint16_t *)addr = (uint16_t) value;
12401 			break;
12402 		case 4:
12403 			*(volatile uint32_t *)addr = (uint32_t) value;
12404 			break;
12405 		case 8:
12406 			*(volatile uint64_t *)addr = (uint64_t) value;
12407 			break;
12408 		default:
12409 			panic("%s: width %llu not supported", __func__, width);
12410 		}
12411 	}
12412 }
12413 
12414 #else /* HAS_GUARDED_IO_FILTER */
12415 
12416 __attribute__((__noreturn__))
12417 void
12418 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
12419 {
12420 	panic("%s called on an unsupported platform.", __FUNCTION__);
12421 }
12422 
12423 #endif /* HAS_GUARDED_IO_FILTER */
12424 
12425 void * __attribute__((noreturn))
12426 pmap_claim_reserved_ppl_page(void)
12427 {
12428 	panic("%s: function not supported in this environment", __FUNCTION__);
12429 }
12430 
12431 void __attribute__((noreturn))
12432 pmap_free_reserved_ppl_page(void __unused *kva)
12433 {
12434 	panic("%s: function not supported in this environment", __FUNCTION__);
12435 }
12436 
12437 bool
12438 pmap_lookup_in_loaded_trust_caches(__unused const uint8_t cdhash[CS_CDHASH_LEN])
12439 {
12440 	kern_return_t kr = query_trust_cache(
12441 		kTCQueryTypeLoadable,
12442 		cdhash,
12443 		NULL);
12444 
12445 	if (kr == KERN_SUCCESS) {
12446 		return true;
12447 	}
12448 	return false;
12449 }
12450 
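/**
 * Look up a cdhash in the static trust cache.
 *
 * @return 0 if the hash is not present. On a hit, a packed value containing
 *         TC_LOOKUP_FOUND in the result field, along with the entry's hash
 *         type and flags shifted into their TC_LOOKUP_* positions.
 */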
12451 uint32_t
12452 pmap_lookup_in_static_trust_cache(__unused const uint8_t cdhash[CS_CDHASH_LEN])
12453 {
12454 	TrustCacheQueryToken_t query_token = {0};
12455 	kern_return_t kr = KERN_NOT_FOUND;
12456 	uint64_t flags = 0;
12457 	uint8_t hash_type = 0;
12458 
12459 	kr = query_trust_cache(
12460 		kTCQueryTypeStatic,
12461 		cdhash,
12462 		&query_token);
12463 
12464 	if (kr == KERN_SUCCESS) {
12465 		amfi->TrustCache.queryGetFlags(&query_token, &flags);
12466 		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
12467 
12468 		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
12469 		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
12470 		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
12471 	}
12472 
12473 	return 0;
12474 }
12475 
12476 #if DEVELOPMENT || DEBUG
12477 
12478 struct page_table_dump_header {
12479 	uint64_t pa;
12480 	uint64_t num_entries;
12481 	uint64_t start_va;
12482 	uint64_t end_va;
12483 };
12484 
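/**
 * Recursively dump a page table hierarchy into a caller-provided buffer.
 *
 * For every table whose level is selected in level_mask, a
 * struct page_table_dump_header is emitted followed by a raw copy of that
 * table's entries; the walk then recurses into any next-level tables. The
 * resulting buffer is therefore a sequence of (header, table copy) records,
 * one per visited table. Returns KERN_INSUFFICIENT_BUFFER_SIZE if the next
 * record would not fit between bufp and buf_end.
 */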
12485 static kern_return_t
12486 pmap_dump_page_tables_recurse(pmap_t pmap,
12487     const tt_entry_t *ttp,
12488     unsigned int cur_level,
12489     unsigned int level_mask,
12490     uint64_t start_va,
12491     void *buf_start,
12492     void *buf_end,
12493     size_t *bytes_copied)
12494 {
12495 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12496 	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
12497 
12498 	uint64_t size = pt_attr->pta_level_info[cur_level].size;
12499 	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
12500 	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
12501 	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
12502 
12503 	void *bufp = (uint8_t*)buf_start + *bytes_copied;
12504 
12505 	if (cur_level == pt_attr_root_level(pt_attr)) {
12506 		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
12507 		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
12508 	}
12509 
12510 	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
12511 	const tt_entry_t *tt_end = &ttp[num_entries];
12512 
12513 	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
12514 		return KERN_INSUFFICIENT_BUFFER_SIZE;
12515 	}
12516 
12517 	if (level_mask & (1U << cur_level)) {
12518 		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
12519 		header->pa = kvtophys_nofail((vm_offset_t)ttp);
12520 		header->num_entries = num_entries;
12521 		header->start_va = start_va;
12522 		header->end_va = start_va + (num_entries * size);
12523 
12524 		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
12525 		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
12526 	}
12527 	uint64_t current_va = start_va;
12528 
12529 	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
12530 		tt_entry_t tte = *ttep;
12531 
12532 		if (!(tte & valid_mask)) {
12533 			continue;
12534 		}
12535 
12536 		if ((tte & type_mask) == type_block) {
12537 			continue;
12538 		} else {
12539 			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
12540 				panic("%s: corrupt entry %#llx at %p, "
12541 				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
12542 				    __FUNCTION__, tte, ttep,
12543 				    ttp, cur_level, bufp, buf_end);
12544 			}
12545 
12546 			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
12547 
12548 			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
12549 			    level_mask, current_va, buf_start, buf_end, bytes_copied);
12550 
12551 			if (recurse_result != KERN_SUCCESS) {
12552 				return recurse_result;
12553 			}
12554 		}
12555 	}
12556 
12557 	return KERN_SUCCESS;
12558 }
12559 
12560 kern_return_t
12561 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
12562 {
12563 	if (not_in_kdp) {
12564 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
12565 	}
12566 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
12567 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
12568 }
12569 
12570 #else /* DEVELOPMENT || DEBUG */
12571 
12572 kern_return_t
12573 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
12574     unsigned int level_mask __unused, size_t *bytes_copied __unused)
12575 {
12576 	return KERN_NOT_SUPPORTED;
12577 }
12578 #endif /* DEVELOPMENT || DEBUG */
12579 
12580 
12581 #ifdef CONFIG_XNUPOST
12582 static volatile bool pmap_test_took_fault = false;
12583 
12584 static bool
12585 pmap_test_fault_handler(arm_saved_state_t * state)
12586 {
12587 	bool retval                 = false;
12588 	uint64_t esr                = get_saved_state_esr(state);
12589 	esr_exception_class_t class = ESR_EC(esr);
12590 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
12591 
12592 	if ((class == ESR_EC_DABORT_EL1) &&
12593 	    ((fsc == FSC_PERMISSION_FAULT_L3)
12594 	    || (fsc == FSC_ACCESS_FLAG_FAULT_L3)
12595 	    || (fsc == FSC_TRANSLATION_FAULT_L0))) {
12596 		pmap_test_took_fault = true;
12597 		/* return to the instruction immediately after the faulting access */
12598 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
12599 		retval = true;
12600 	}
12601 
12602 	return retval;
12603 }
12604 
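/**
 * Perform a read or write of va while pmap is active and check whether it
 * faults.
 *
 * Interrupts and preemption are disabled (and PAN is cleared for user pmaps)
 * around the access, and pmap_test_fault_handler records whether a fault was
 * taken. Returns true if the observed fault behavior matches should_fault.
 */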
12605 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
12606 static NOKASAN bool
12607 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
12608 {
12609 	pmap_t old_pmap = NULL;
12610 	thread_t thread = current_thread();
12611 
12612 	pmap_test_took_fault = false;
12613 
12614 	/*
12615 	 * We're potentially switching pmaps without using the normal thread
12616 	 * mechanism; disable interrupts and preemption to avoid any unexpected
12617 	 * memory accesses.
12618 	 */
12619 	const boolean_t old_int_state = ml_set_interrupts_enabled(FALSE);
12620 	mp_disable_preemption();
12621 
12622 	if (pmap != NULL) {
12623 		old_pmap = current_pmap();
12624 		pmap_switch(pmap, thread);
12625 
12626 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
12627 #if __ARM_PAN_AVAILABLE__
12628 		__builtin_arm_wsr("pan", 0);
12629 #endif /* __ARM_PAN_AVAILABLE__ */
12630 	}
12631 
12632 	ml_expect_fault_begin(pmap_test_fault_handler, va);
12633 
12634 	if (is_write) {
12635 		*((volatile uint64_t*)(va)) = 0xdec0de;
12636 	} else {
12637 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
12638 		(void)tmp;
12639 	}
12640 
12641 	/* Save the fault bool, and undo the gross stuff we did. */
12642 	bool took_fault = pmap_test_took_fault;
12643 	ml_expect_fault_end();
12644 
12645 	if (pmap != NULL) {
12646 #if __ARM_PAN_AVAILABLE__
12647 		__builtin_arm_wsr("pan", 1);
12648 #endif /* __ARM_PAN_AVAILABLE__ */
12649 
12650 		pmap_switch(old_pmap, thread);
12651 	}
12652 
12653 	mp_enable_preemption();
12654 	ml_set_interrupts_enabled(old_int_state);
12655 	bool retval = (took_fault == should_fault);
12656 	return retval;
12657 }
12658 
12659 static bool
12660 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
12661 {
12662 	bool retval = pmap_test_access(pmap, va, should_fault, false);
12663 
12664 	if (!retval) {
12665 		T_FAIL("%s: %s, "
12666 		    "pmap=%p, va=%p, should_fault=%u",
12667 		    __func__, should_fault ? "did not fault" : "faulted",
12668 		    pmap, (void*)va, (unsigned)should_fault);
12669 	}
12670 
12671 	return retval;
12672 }
12673 
12674 static bool
12675 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
12676 {
12677 	bool retval = pmap_test_access(pmap, va, should_fault, true);
12678 
12679 	if (!retval) {
12680 		T_FAIL("%s: %s, "
12681 		    "pmap=%p, va=%p, should_fault=%u",
12682 		    __func__, should_fault ? "did not fault" : "faulted",
12683 		    pmap, (void*)va, (unsigned)should_fault);
12684 	}
12685 
12686 	return retval;
12687 }
12688 
12689 static bool
12690 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
12691 {
12692 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12693 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
12694 
12695 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
12696 
12697 	if (!retval) {
12698 		T_FAIL("%s: bits=%u, "
12699 		    "pa=%p, should_be_set=%u",
12700 		    __func__, bits,
12701 		    (void*)pa, should_be_set);
12702 	}
12703 
12704 	return retval;
12705 }
12706 
12707 static __attribute__((noinline)) bool
12708 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
12709 {
12710 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
12711 	return retval;
12712 }
12713 
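/**
 * Run one pmap_test pass against a freshly created pmap.
 *
 * Exercises mapping creation at various protections, fault behavior for
 * read/write/XO mappings, the ref/mod state machine, pmap_protect and
 * pmap_page_protect, disconnect, and (where configured) mappings in the
 * extended ARM_LARGE_MEMORY address space. The pmap and its backing pages
 * are torn down before returning.
 *
 * @param flags PMAP_CREATE_* flags passed to pmap_create_options().
 */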
12714 static int
12715 pmap_test_test_config(unsigned int flags)
12716 {
12717 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
12718 	unsigned int map_count = 0;
12719 	unsigned long page_ratio = 0;
12720 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
12721 
12722 	if (!pmap) {
12723 		panic("Failed to allocate pmap");
12724 	}
12725 
12726 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12727 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
12728 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
12729 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
12730 
12731 	if (pmap_page_size <= native_page_size) {
12732 		page_ratio = native_page_size / pmap_page_size;
12733 	} else {
12734 		/*
12735 		 * We claim to support a page_ratio of less than 1, which is
12736 		 * not currently supported by the pmap layer; panic.
12737 		 */
12738 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu, "
12739 		    "flags=%u",
12740 		    __func__, native_page_size, pmap_page_size,
12741 		    flags);
12742 	}
12743 
12744 	if (PAGE_RATIO > 1) {
12745 		/*
12746 		 * The kernel is deliberately pretending to have 16KB pages.
12747 		 * The pmap layer has code that supports this, so pretend the
12748 		 * page size is larger than it is.
12749 		 */
12750 		pmap_page_size = PAGE_SIZE;
12751 		native_page_size = PAGE_SIZE;
12752 	}
12753 
12754 	/*
12755 	 * Get two pages from the VM; one to be mapped wired, and one to be
12756 	 * mapped nonwired.
12757 	 */
12758 	vm_page_t unwired_vm_page = vm_page_grab();
12759 	vm_page_t wired_vm_page = vm_page_grab();
12760 
12761 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
12762 		panic("Failed to grab VM pages");
12763 	}
12764 
12765 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
12766 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
12767 
12768 	pmap_paddr_t pa = ptoa(pn);
12769 	pmap_paddr_t wired_pa = ptoa(wired_pn);
12770 
12771 	/*
12772 	 * We'll start mappings at the second twig TT.  This keeps us from only
12773 	 * using the first entry in each TT, which would trivially be address
12774 	 * 0; one of the things we will need to test is retrieving the VA for
12775 	 * a given PTE.
12776 	 */
12777 	vm_map_address_t va_base = pmap_twig_size;
12778 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
12779 
12780 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
12781 		/*
12782 		 * Not exactly a functional failure, but this test relies on
12783 		 * there being a spare PTE slot we can use to pin the TT.
12784 		 */
12785 		panic("Cannot pin translation table");
12786 	}
12787 
12788 	/*
12789 	 * Create the wired mapping; this will prevent the pmap layer from
12790 	 * reclaiming our test TTs, which would interfere with this test
12791 	 * ("interfere" -> "make it panic").
12792 	 */
12793 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true, PMAP_MAPPING_TYPE_INFER);
12794 
12795 	T_LOG("Validate that kernel cannot write to SPTM memory.");
12796 	pt_entry_t * ptep = pmap_pte(pmap, va_base);
12797 	pmap_test_write(NULL, (vm_map_address_t)ptep, true);
12798 
12799 	/*
12800 	 * Create read-only mappings of the nonwired page; if the pmap does
12801 	 * not use the same page size as the kernel, create multiple mappings
12802 	 * so that the kernel page is fully mapped.
12803 	 */
12804 	for (map_count = 0; map_count < page_ratio; map_count++) {
12805 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)),
12806 		    VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12807 	}
12808 
12809 	/* Validate that all the PTEs have the expected PA and VA. */
12810 	for (map_count = 0; map_count < page_ratio; map_count++) {
12811 		ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
12812 
12813 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
12814 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
12815 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
12816 		}
12817 
12818 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
12819 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
12820 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
12821 		}
12822 	}
12823 
12824 	T_LOG("Validate that reads to our mapping do not fault.");
12825 	pmap_test_read(pmap, va_base, false);
12826 
12827 	T_LOG("Validate that writes to our mapping fault.");
12828 	pmap_test_write(pmap, va_base, true);
12829 
12830 	T_LOG("Make the first mapping writable.");
12831 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12832 
12833 	T_LOG("Validate that writes to our mapping do not fault.");
12834 	pmap_test_write(pmap, va_base, false);
12835 
12836 	/*
12837 	 * For page ratios of greater than 1: validate that writes to the other
12838 	 * mappings still fault.  Remove the mappings afterwards (we're done
12839 	 * with page ratio testing).
12840 	 */
12841 	for (map_count = 1; map_count < page_ratio; map_count++) {
12842 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
12843 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
12844 	}
12845 
12846 	/* Remove remaining mapping */
12847 	pmap_remove(pmap, va_base, va_base + pmap_page_size);
12848 
12849 	T_LOG("Test XO mapping");
12850 	kern_return_t kr = pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12851 	if (pmap_allows_xo(pmap)) {
12852 		if (kr != KERN_SUCCESS) {
12853 			T_FAIL("XO mapping returned 0x%x instead of KERN_SUCCESS", (unsigned int)kr);
12854 		}
12855 	} else if (kr != KERN_PROTECTION_FAILURE) {
12856 		T_FAIL("XO mapping returned 0x%x instead of KERN_PROTECTION_FAILURE", (unsigned int)kr);
12857 	}
12858 
12859 	T_LOG("Make the first mapping RX");
12860 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE | VM_PROT_READ, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12861 
12862 	T_LOG("Validate that reads to our mapping do not fault.");
12863 	pmap_test_read(pmap, va_base, false);
12864 
12865 	T_LOG("Validate that writes to our mapping fault.");
12866 	pmap_test_write(pmap, va_base, true);
12867 
12868 	pmap_remove(pmap, va_base, va_base + pmap_page_size);
12869 
12870 	T_LOG("Mark the page unreferenced and unmodified.");
12871 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12872 	pmap_test_check_refmod(pa, 0);
12873 	pmap_recycle_page(atop(pa));
12874 
12875 	/*
12876 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
12877 	 * different protection/fault_type settings, and confirm that the
12878 	 * ref/mod state matches our expectations at each step.
12879 	 */
12880 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
12881 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12882 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12883 
12884 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
12885 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12886 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12887 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12888 
12889 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
12890 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12891 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12892 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12893 
12894 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
12895 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12896 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12897 
12898 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
12899 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12900 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12901 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12902 
12903 	/*
12904 	 * Shared memory testing; we'll have two mappings; one read-only,
12905 	 * one read-write.
12906 	 */
12907 	vm_map_address_t rw_base = va_base;
12908 	vm_map_address_t ro_base = va_base + pmap_page_size;
12909 
12910 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12911 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12912 
12913 	/*
12914 	 * Test that we take faults as expected for unreferenced/unmodified
12915 	 * pages.  Also test the arm_fast_fault interface, to ensure that
12916 	 * mapping permissions change as expected.
12917 	 */
12918 	T_LOG("!ref/!mod: expect no access");
12919 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12920 	pmap_test_read_write(pmap, ro_base, false, false);
12921 	pmap_test_read_write(pmap, rw_base, false, false);
12922 
12923 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
12924 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
12925 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12926 	pmap_test_read_write(pmap, ro_base, true, false);
12927 	pmap_test_read_write(pmap, rw_base, true, false);
12928 
12929 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
12930 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12931 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12932 	pmap_test_read_write(pmap, ro_base, true, false);
12933 	pmap_test_read_write(pmap, rw_base, true, true);
12934 
12935 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
12936 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12937 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12938 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12939 	pmap_test_read_write(pmap, ro_base, true, false);
12940 	pmap_test_read_write(pmap, rw_base, true, true);
12941 
12942 	T_LOG("RW protect both mappings; should not change protections.");
12943 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12944 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12945 	pmap_test_read_write(pmap, ro_base, true, false);
12946 	pmap_test_read_write(pmap, rw_base, true, true);
12947 
12948 	T_LOG("Read protect both mappings; RW mapping should become RO.");
12949 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
12950 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
12951 	pmap_test_read_write(pmap, ro_base, true, false);
12952 	pmap_test_read_write(pmap, rw_base, true, false);
12953 
12954 	T_LOG("RW protect the page; mappings should not change protections.");
12955 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12956 	pmap_page_protect(pn, VM_PROT_ALL);
12957 	pmap_test_read_write(pmap, ro_base, true, false);
12958 	pmap_test_read_write(pmap, rw_base, true, true);
12959 
12960 	T_LOG("Read protect the page; RW mapping should become RO.");
12961 	pmap_page_protect(pn, VM_PROT_READ);
12962 	pmap_test_read_write(pmap, ro_base, true, false);
12963 	pmap_test_read_write(pmap, rw_base, true, false);
12964 
12965 	T_LOG("Validate that disconnect removes all known mappings of the page.");
12966 	pmap_disconnect(pn);
12967 	if (!pmap_verify_free(pn)) {
12968 		T_FAIL("Page still has mappings");
12969 	}
12970 
12971 #if defined(ARM_LARGE_MEMORY)
12972 #define PMAP_TEST_LARGE_MEMORY_VA (64 * (1ULL << 40)) /* 64 TB */
12973 #if !defined(ARM_LARGE_MEMORY_KERNONLY)
12974 
12975 	T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY.");
12976 	pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12977 	pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true);
12978 	pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size);
12979 #else /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12980 	/* Using kernel-only large memory. Make sure user pmap will fail. */
12981 	T_LOG("Expect wired mapping to fault in ARM_LARGE_MEMORY when using KERNONLY.");
12982 
12983 	/* The mapping should be rejected, it's outside of T0SZ */
12984 	kr = pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa,
12985 	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12986 	T_QUIET; T_ASSERT_NE_INT(kr, KERN_SUCCESS, NULL);
12987 
12988 	/* Addressing outside of T0SZ should result in a L0 xlate fault */
12989 	const bool did_fault = pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, false, false);
12990 	T_QUIET; T_ASSERT(did_fault, NULL);
12991 #endif /* !defined(ARM_LARGE_MEMORY_KERNONLY) */
12992 #endif /* ARM_LARGE_MEMORY */
12993 
12994 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
12995 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
12996 	pmap_destroy(pmap);
12997 
12998 	T_LOG("Release the pages back to the VM.");
12999 	vm_page_lock_queues();
13000 	vm_page_free(unwired_vm_page);
13001 	vm_page_free(wired_vm_page);
13002 	vm_page_unlock_queues();
13003 
13004 	T_LOG("Testing successful!");
13005 	return 0;
13006 }
13007 
13008 kern_return_t
13009 pmap_test(void)
13010 {
13011 	T_LOG("Starting pmap_tests");
13012 	const int flags = PMAP_CREATE_TEST | PMAP_CREATE_64BIT;
13013 
13014 #if __ARM_MIXED_PAGE_SIZE__
13015 	T_LOG("Testing VM_PAGE_SIZE_4KB");
13016 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
13017 	T_LOG("Testing VM_PAGE_SIZE_16KB");
13018 	pmap_test_test_config(flags);
13019 #else /* __ARM_MIXED_PAGE_SIZE__ */
13020 	pmap_test_test_config(flags);
13021 #endif /* __ARM_MIXED_PAGE_SIZE__ */
13022 
13023 	T_PASS("completed pmap_test successfully");
13024 	return KERN_SUCCESS;
13025 }
13026 #endif /* CONFIG_XNUPOST */
13027 
13028 /*
13029  * The following function should never make it to RELEASE code, since
13030  * it provides a way to get the PPL to modify text pages.
13031  */
13032 #if DEVELOPMENT || DEBUG
13033 
13034 /**
13035  * Forcibly overwrite executable text with an illegal instruction.
13036  *
13037  * @note Only used for xnu unit testing.
13038  *
13039  * @param pa The physical address to corrupt.
13040  *
13041  * @return KERN_SUCCESS on success.
13042  */
13043 kern_return_t
13044 pmap_test_text_corruption(pmap_paddr_t pa __unused)
13045 {
13046 	/*
13047 	 * SPTM TODO: implement an SPTM version of this.
13048 	 * The physical aperture is owned by the SPTM and text
13049 	 * pages have RO physical aperture mappings.
13050 	 */
13051 	return KERN_SUCCESS;
13052 }
13053 
13054 #endif /* DEVELOPMENT || DEBUG */
13055 
13056