xref: /xnu-11417.140.69/osfmk/arm64/sptm/pmap/pmap.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2011-2022 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32 
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39 
40 #include <mach/boolean.h>
41 #include <kern/backtrace.h>
42 #include <kern/bits.h>
43 #include <kern/ecc.h>
44 #include <kern/thread.h>
45 #include <kern/sched.h>
46 #include <kern/zalloc.h>
47 #include <kern/zalloc_internal.h>
48 #include <kern/kalloc.h>
49 #include <kern/spl.h>
50 #include <kern/startup.h>
51 #include <kern/trap_telemetry.h>
52 #include <kern/trustcache.h>
53 
54 #include <os/overflow.h>
55 
56 #include <vm/pmap.h>
57 #include <vm/pmap_cs.h>
58 #include <vm/vm_map_xnu.h>
59 #include <vm/vm_kern.h>
60 #include <vm/vm_protos.h>
61 #include <vm/vm_object_internal.h>
62 #include <vm/vm_page_internal.h>
63 #include <vm/vm_pageout.h>
64 #include <vm/cpm_internal.h>
65 
66 
67 #include <libkern/section_keywords.h>
68 #include <sys/errno.h>
69 
70 #include <libkern/amfi/amfi.h>
71 #include <sys/trusted_execution_monitor.h>
72 #include <sys/trust_caches.h>
73 #include <sys/code_signing.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/thread.h>
77 #include <machine/lowglobals.h>
78 
79 #include <arm/caches_internal.h>
80 #include <arm/cpu_data.h>
81 #include <arm/cpu_data_internal.h>
82 #include <arm/cpu_capabilities.h>
83 #include <arm/cpu_number.h>
84 #include <arm/machine_cpu.h>
85 #include <arm/misc_protos.h>
86 #include <arm/trap_internal.h>
87 #include <arm64/sptm/pmap/pmap_internal.h>
88 #include <arm64/sptm/sptm.h>
89 
90 #include <arm64/proc_reg.h>
91 #include <pexpert/arm64/boot.h>
92 #include <arm64/ppl/uat.h>
93 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
94 #include <arm64/amcc_rorgn.h>
95 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
96 
97 #include <pexpert/device_tree.h>
98 
99 #include <san/kasan.h>
100 #include <sys/cdefs.h>
101 
102 #if defined(HAS_APPLE_PAC)
103 #include <ptrauth.h>
104 #endif
105 
106 #ifdef CONFIG_XNUPOST
107 #include <tests/xnupost.h>
108 #endif
109 
110 
111 #if HIBERNATION
112 #include <IOKit/IOHibernatePrivate.h>
113 #endif /* HIBERNATION */
114 
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 
117 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
118 
119 
120 /**
121  * Per-CPU data used to do setup and post-processing for SPTM calls.
122  * On the setup side, this structure is used to store parameters for batched SPTM operations.
123  * These parameters may be large (upwards of 1K), and given that SPTM calls are generally
124  * issued from preemption-disabled contexts anyway, it's better to store them in per-CPU
125  * data rather than the local stack.
126  * On the post-processing side, this structure exposes a pointer to the SPTM's per-CPU array
127  * of 'prev_ptes', that is, the prior value encountered in each PTE at the time of the SPTM's
128  * atomic update of that PTE.
129  */
130 pmap_sptm_percpu_data_t PERCPU_DATA(pmap_sptm_percpu);
131 
132 /**
133  * Reference group for global tracking of all outstanding pmap references.
134  */
135 os_refgrp_decl(static, pmap_refgrp, "pmap", NULL);
136 
137 /* Boot-arg to enable/disable the use of XNU_KERNEL_RESTRICTED type in SPTM. */
138 TUNABLE(bool, use_xnu_restricted, "xnu_restricted", true);
139 
140 extern u_int32_t random(void); /* from <libkern/libkern.h> */
141 
142 static bool alloc_asid(pmap_t pmap);
143 static void free_asid(pmap_t pmap);
144 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only);
145 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
146 
147 const struct page_table_ops native_pt_ops =
148 {
149 	.alloc_id = alloc_asid,
150 	.free_id = free_asid,
151 	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
152 	.wimg_to_pte = wimg_to_pte,
153 };
154 
155 const struct page_table_level_info pmap_table_level_info_16k[] =
156 {
157 	[0] = {
158 		.size       = ARM_16K_TT_L0_SIZE,
159 		.offmask    = ARM_16K_TT_L0_OFFMASK,
160 		.shift      = ARM_16K_TT_L0_SHIFT,
161 		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
162 		.valid_mask = ARM_TTE_VALID,
163 		.type_mask  = ARM_TTE_TYPE_MASK,
164 		.type_block = ARM_TTE_TYPE_BLOCK
165 	},
166 	[1] = {
167 		.size       = ARM_16K_TT_L1_SIZE,
168 		.offmask    = ARM_16K_TT_L1_OFFMASK,
169 		.shift      = ARM_16K_TT_L1_SHIFT,
170 		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
171 		.valid_mask = ARM_TTE_VALID,
172 		.type_mask  = ARM_TTE_TYPE_MASK,
173 		.type_block = ARM_TTE_TYPE_BLOCK
174 	},
175 	[2] = {
176 		.size       = ARM_16K_TT_L2_SIZE,
177 		.offmask    = ARM_16K_TT_L2_OFFMASK,
178 		.shift      = ARM_16K_TT_L2_SHIFT,
179 		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
180 		.valid_mask = ARM_TTE_VALID,
181 		.type_mask  = ARM_TTE_TYPE_MASK,
182 		.type_block = ARM_TTE_TYPE_BLOCK
183 	},
184 	[3] = {
185 		.size       = ARM_16K_TT_L3_SIZE,
186 		.offmask    = ARM_16K_TT_L3_OFFMASK,
187 		.shift      = ARM_16K_TT_L3_SHIFT,
188 		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
189 		.valid_mask = ARM_PTE_TYPE_VALID,
190 		.type_mask  = ARM_TTE_TYPE_MASK,
191 		.type_block = ARM_TTE_TYPE_L3BLOCK
192 	}
193 };
194 
195 const struct page_table_level_info pmap_table_level_info_4k[] =
196 {
197 	[0] = {
198 		.size       = ARM_4K_TT_L0_SIZE,
199 		.offmask    = ARM_4K_TT_L0_OFFMASK,
200 		.shift      = ARM_4K_TT_L0_SHIFT,
201 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
202 		.valid_mask = ARM_TTE_VALID,
203 		.type_mask  = ARM_TTE_TYPE_MASK,
204 		.type_block = ARM_TTE_TYPE_BLOCK
205 	},
206 	[1] = {
207 		.size       = ARM_4K_TT_L1_SIZE,
208 		.offmask    = ARM_4K_TT_L1_OFFMASK,
209 		.shift      = ARM_4K_TT_L1_SHIFT,
210 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
211 		.valid_mask = ARM_TTE_VALID,
212 		.type_mask  = ARM_TTE_TYPE_MASK,
213 		.type_block = ARM_TTE_TYPE_BLOCK
214 	},
215 	[2] = {
216 		.size       = ARM_4K_TT_L2_SIZE,
217 		.offmask    = ARM_4K_TT_L2_OFFMASK,
218 		.shift      = ARM_4K_TT_L2_SHIFT,
219 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
220 		.valid_mask = ARM_TTE_VALID,
221 		.type_mask  = ARM_TTE_TYPE_MASK,
222 		.type_block = ARM_TTE_TYPE_BLOCK
223 	},
224 	[3] = {
225 		.size       = ARM_4K_TT_L3_SIZE,
226 		.offmask    = ARM_4K_TT_L3_OFFMASK,
227 		.shift      = ARM_4K_TT_L3_SHIFT,
228 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
229 		.valid_mask = ARM_PTE_TYPE_VALID,
230 		.type_mask  = ARM_TTE_TYPE_MASK,
231 		.type_block = ARM_TTE_TYPE_L3BLOCK
232 	}
233 };
234 
235 const struct page_table_level_info pmap_table_level_info_4k_stage2[] =
236 {
237 	[0] = { /* Unused */
238 		.size       = ARM_4K_TT_L0_SIZE,
239 		.offmask    = ARM_4K_TT_L0_OFFMASK,
240 		.shift      = ARM_4K_TT_L0_SHIFT,
241 		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
242 		.valid_mask = ARM_TTE_VALID,
243 		.type_mask  = ARM_TTE_TYPE_MASK,
244 		.type_block = ARM_TTE_TYPE_BLOCK
245 	},
246 	[1] = { /* Concatenated, so index mask is larger than normal */
247 		.size       = ARM_4K_TT_L1_SIZE,
248 		.offmask    = ARM_4K_TT_L1_OFFMASK,
249 		.shift      = ARM_4K_TT_L1_SHIFT,
250 #ifdef ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK
251 		.index_mask = ARM_4K_TT_L1_40_BIT_CONCATENATED_INDEX_MASK,
252 #else
253 		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
254 #endif
255 		.valid_mask = ARM_TTE_VALID,
256 		.type_mask  = ARM_TTE_TYPE_MASK,
257 		.type_block = ARM_TTE_TYPE_BLOCK
258 	},
259 	[2] = {
260 		.size       = ARM_4K_TT_L2_SIZE,
261 		.offmask    = ARM_4K_TT_L2_OFFMASK,
262 		.shift      = ARM_4K_TT_L2_SHIFT,
263 		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
264 		.valid_mask = ARM_TTE_VALID,
265 		.type_mask  = ARM_TTE_TYPE_MASK,
266 		.type_block = ARM_TTE_TYPE_BLOCK
267 	},
268 	[3] = {
269 		.size       = ARM_4K_TT_L3_SIZE,
270 		.offmask    = ARM_4K_TT_L3_OFFMASK,
271 		.shift      = ARM_4K_TT_L3_SHIFT,
272 		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
273 		.valid_mask = ARM_PTE_TYPE_VALID,
274 		.type_mask  = ARM_TTE_TYPE_MASK,
275 		.type_block = ARM_TTE_TYPE_L3BLOCK
276 	}
277 };
278 
279 const struct page_table_attr pmap_pt_attr_4k = {
280 	.pta_level_info = pmap_table_level_info_4k,
281 	.pta_root_level = (T0SZ_BOOT - 16) / 9,
282 #if __ARM_MIXED_PAGE_SIZE__
283 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
284 #else /* __ARM_MIXED_PAGE_SIZE__ */
285 #if __ARM_16K_PG__
286 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
287 #else /* __ARM_16K_PG__ */
288 	.pta_commpage_level = PMAP_TT_L1_LEVEL,
289 #endif /* __ARM_16K_PG__ */
290 #endif /* __ARM_MIXED_PAGE_SIZE__ */
291 	.pta_max_level  = PMAP_TT_L3_LEVEL,
292 	.pta_ops = &native_pt_ops,
293 	.ap_ro = ARM_PTE_AP(AP_RORO),
294 	.ap_rw = ARM_PTE_AP(AP_RWRW),
295 	.ap_rona = ARM_PTE_AP(AP_RONA),
296 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
297 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
298 	.ap_x = ARM_PTE_PNX,
299 #if __ARM_MIXED_PAGE_SIZE__
300 	.pta_tcr_value  = TCR_EL1_4KB,
301 #endif /* __ARM_MIXED_PAGE_SIZE__ */
302 	.pta_page_size  = 4096,
303 	.pta_page_shift = 12,
304 	.geometry_id = SPTM_PT_GEOMETRY_4K,
305 };
306 
307 const struct page_table_attr pmap_pt_attr_16k = {
308 	.pta_level_info = pmap_table_level_info_16k,
309 	.pta_root_level = PMAP_TT_L1_LEVEL,
310 	.pta_commpage_level = PMAP_TT_L2_LEVEL,
311 	.pta_max_level  = PMAP_TT_L3_LEVEL,
312 	.pta_ops = &native_pt_ops,
313 	.ap_ro = ARM_PTE_AP(AP_RORO),
314 	.ap_rw = ARM_PTE_AP(AP_RWRW),
315 	.ap_rona = ARM_PTE_AP(AP_RONA),
316 	.ap_rwna = ARM_PTE_AP(AP_RWNA),
317 	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
318 	.ap_x = ARM_PTE_PNX,
319 #if __ARM_MIXED_PAGE_SIZE__
320 	.pta_tcr_value  = TCR_EL1_16KB,
321 #endif /* __ARM_MIXED_PAGE_SIZE__ */
322 	.pta_page_size  = 16384,
323 	.pta_page_shift = 14,
324 	.geometry_id = SPTM_PT_GEOMETRY_16K,
325 };
326 
327 #if __ARM_16K_PG__
328 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
329 #else /* !__ARM_16K_PG__ */
330 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
331 #endif /* !__ARM_16K_PG__ */
332 
333 
334 #if DEVELOPMENT || DEBUG
335 int vm_footprint_suspend_allowed = 1;
336 
337 extern int pmap_ledgers_panic;
338 extern int pmap_ledgers_panic_leeway;
339 
340 #endif /* DEVELOPMENT || DEBUG */
341 
342 #if DEVELOPMENT || DEBUG
343 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
344 	(current_thread()->pmap_footprint_suspended)
345 #else /* DEVELOPMENT || DEBUG */
346 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
347 #endif /* DEVELOPMENT || DEBUG */
348 
349 #define PMAP_TT_ALLOCATE_NOWAIT         0x1
350 
351 
352 /* Keeps track of whether the pmap has been bootstrapped */
353 SECURITY_READ_ONLY_LATE(bool) pmap_bootstrapped = false;
354 
355 /*
356  * Represents a TLB range that will be flushed before the current pmap operation returns.
357  * Used by phys_attribute_clear_range to defer flushing pages in this range until
358  * the end of the operation, and to accumulate batched operations for submission
359  * to the SPTM as a performance optimization.
360  */
361 typedef struct pmap_tlb_flush_range {
362 	/* Address space in which the flush region resides */
363 	pmap_t ptfr_pmap;
364 
365 	/* Page-aligned beginning of the flush region */
366 	vm_map_address_t ptfr_start;
367 
368 	/* Page-aligned non-inclusive end of the flush region */
369 	vm_map_address_t ptfr_end;
370 
371 	/**
372 	 * Address of current PTE position in ptfr_pmap's [ptfr_start, ptfr_end) region.
373 	 * This is meant to be set up by the caller of pmap_page_protect_options_with_flush_range()
374 	 * or arm_force_fast_fault_with_flush_range(), and used by those functions to determine
375 	 * when a given mapping can be added to the SPTM's per-CPU region templates array vs.
376 	 * the more complex task of adding it to the disjoint ops array.
377 	 */
378 	pt_entry_t *current_ptep;
379 
380 	/**
381 	 * Starting VA for any not-yet-submitted per-CPU region templates.  This is meant to be
382 	 * set up by the caller of pmap_page_protect_options_with_flush_range() or
383 	 * arm_force_fast_fault_with_flush_range() and used by pmap_multipage_op_submit_region()
384 	 * when issuing the SPTM call to purge any pending region ops.
385 	 */
386 	vm_map_address_t pending_region_start;
387 
388 	/**
389 	 * Number of entries in the per-CPU SPTM region templates array which have not
390 	 * yet been submitted to the SPTM.
391 	 */
392 	unsigned int pending_region_entries;
393 
394 	/**
395 	 * Indicates whether at least one region entry was added to the per-CPU region ops
396 	 * array since the last time this field was checked.  Intended to be cleared by the
397 	 * caller.
398 	 */
399 	bool region_entry_added;
400 
401 	/**
402 	 * Marker for the current paddr "header" entry in the per-CPU SPTM disjoint ops array.
403 	 * This field is intended to be modified only by pmap_multipage_op_submit_disjoint()
404 	 * and pmap_multipage_op_add_page(), and should be treated as opaque by callers
405 	 * of those functions.
406 	 */
407 	sptm_update_disjoint_multipage_op_t *current_header;
408 
409 	/**
410 	 * Position in the per-CPU SPTM ops array of the first ordinary
411 	 * sptm_disjoint_op_t entry following [current_header].  This is the starting
412 	 * point at which mappings should be inserted for the page described by
413 	 * [current_header].
414 	 */
415 	unsigned int current_header_first_mapping_index;
416 
417 	/**
418 	 * Number of entries in the per-CPU SPTM disjoint ops array, including paddr headers,
419 	 * which have not yet been submitted to the SPTM.
420 	 */
421 	unsigned int pending_disjoint_entries;
422 
423 	/**
424 	 * This field is used by the preemption check interval logic on the
425 	 * phys_attribute_clear_range() path to determine when sufficient
426 	 * forward progress has been made to check for and (if necessary)
427 	 * handle pending preemption.
428 	 */
429 	unsigned int processed_entries;
430 
431 	/**
432 	 * Indicates whether the top-level caller needs to flush the TLB for
433 	 * the region in [ptfr_pmap] described by [ptfr_start, ptfr_end).
434 	 * This will be set if the SPTM indicates that it needed to alter
435 	 * any valid mapping within this region and SPTM_UPDATE_DEFER_TLBI
436 	 * was passed to the relevant SPTM call(s).
437 	 */
438 	bool ptfr_flush_needed;
439 } pmap_tlb_flush_range_t;
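/*
 * Typical (illustrative) usage: a caller such as phys_attribute_clear_range()
 * stack-allocates and zero-fills one of these structures, fills in ptfr_pmap,
 * ptfr_start and ptfr_end for the VA range being operated on, threads it through
 * the per-page helpers named in the field comments above, and finally performs a
 * single deferred TLB flush for the whole range if ptfr_flush_needed was set.
 */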
440 
441 
442 
443 /* Virtual memory region for early allocation */
444 #define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
445 #define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
446 #define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
447 
448 extern uint8_t bootstrap_pagetables[];
449 
450 extern unsigned int not_in_kdp;
451 
452 extern vm_offset_t first_avail;
453 
454 extern vm_offset_t     virtual_space_start;     /* Next available kernel VA */
455 extern vm_offset_t     virtual_space_end;       /* End of kernel address space */
456 extern vm_offset_t     static_memory_end;
457 
458 extern const vm_map_address_t physmap_base;
459 extern const vm_map_address_t physmap_end;
460 
461 extern int maxproc, hard_maxproc;
462 
463 extern bool sdsb_io_rgns_present;
464 
465 vm_address_t MARK_AS_PMAP_DATA image4_slab = 0;
466 vm_address_t MARK_AS_PMAP_DATA image4_late_slab = 0;
467 
468 /* The number of address bits one TTBR can cover. */
469 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
470 
471 /*
472  * The bounds on our TTBRs.  These are for sanity checking that
473  * an address is accessible by a TTBR before we attempt to map it.
474  */
475 
476 /* The level of the root of a page table. */
477 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
478 
479 /* The number of entries in the root TT of a page table. */
480 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
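/*
 * Worked example (values assumed for illustration): with 16K pages
 * (ARM_PGSHIFT == 14), 8-byte TTEs (TTE_SHIFT == 3) and T0SZ_BOOT == 25,
 * PGTABLE_ADDR_BITS == 39 and therefore:
 *   arm64_root_pgtable_level    == 3 - ((39 - 1 - 14) / 11) == 1
 *   arm64_root_pgtable_num_ttes == 2 << ((39 - 1 - 14) % 11) == 8
 * i.e. the root is an L1 table with 8 entries, each covering a 64GB (2^36 byte)
 * slice of the 2^39-byte address space.
 */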
481 
482 struct pmap     kernel_pmap_store MARK_AS_PMAP_DATA;
483 const pmap_t    kernel_pmap = &kernel_pmap_store;
484 
485 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */
486 
487 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
488 queue_head_t    map_pmap_list MARK_AS_PMAP_DATA;
489 
490 typedef struct tt_free_entry {
491 	struct tt_free_entry    *next;
492 } tt_free_entry_t;
493 
494 unsigned int    inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
495 unsigned int    inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
496 unsigned int    inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;  /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
497 unsigned int    inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
498 unsigned int    inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
499 unsigned int    inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
500 _Atomic unsigned int inuse_iommu_pages_count[SPTM_IOMMUS_N_IDS] = {0}; /* number of active pages for each IOMMU class */
501 
502 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
503 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
504 
505 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;                     /* set by arm_vm_init() - keep out of bss */
506 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;                     /* set by arm_vm_init() - phys tte addr */
507 
508 /* Lock group used for all pmap object locks. */
509 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
510 
511 #if DEVELOPMENT || DEBUG
512 int nx_enabled = 1;                                     /* enable no-execute protection */
513 int allow_data_exec  = 0;                               /* No apps may execute data */
514 int allow_stack_exec = 0;                               /* No apps may execute from the stack */
515 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
516 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
517 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
518 unsigned long pmap_speculation_restrictions MARK_AS_PMAP_DATA = 0;
519 #else /* DEVELOPMENT || DEBUG */
520 const int nx_enabled = 1;                                       /* enable no-execute protection */
521 const int allow_data_exec  = 0;                         /* No apps may execute data */
522 const int allow_stack_exec = 0;                         /* No apps may execute from the stack */
523 #endif /* DEVELOPMENT || DEBUG */
524 
525 
526 #if MACH_ASSERT
527 static void pmap_check_ledgers(pmap_t pmap);
528 #else
529 static inline void
530 pmap_check_ledgers(__unused pmap_t pmap)
531 {
532 }
533 #endif /* MACH_ASSERT */
534 
535 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
536 
537 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_first_phys = (pmap_paddr_t) 0;
538 SECURITY_READ_ONLY_LATE(pmap_paddr_t)   vm_last_phys = (pmap_paddr_t) 0;
539 
540 SECURITY_READ_ONLY_LATE(boolean_t)      pmap_initialized = FALSE;       /* Has pmap_init completed? */
541 
542 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
543 
544 /* end of shared region + 512MB for various purposes */
545 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
546 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
547     "Minimum address space size outside allowable range");
548 
549 // Max offset is 15.375GB for devices with "large" memory config
550 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
551 // Max offset is 11.375GB for devices with "small" memory config
552 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
553 
554 
555 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
556     "Large device address space size outside allowable range");
557 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
558     "Small device address space size outside allowable range");
559 
560 #  ifdef XNU_TARGET_OS_OSX
561 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
562 #  else
563 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
564 #  endif
565 
566 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
567 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = TRUE;
568 #else
569 SECURITY_READ_ONLY_LATE(boolean_t)   pmap_panic_dev_wimg_on_managed = FALSE;
570 #endif
571 
572 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
573 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
574 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
575 #if !HAS_16BIT_ASID
576 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
577 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
578 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
579 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
580 #else
581 static uint16_t last_allocated_asid = 0;
582 #endif /* !HAS_16BIT_ASID */
583 
584 
585 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_default_table;
586 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_default_table;
587 #if __ARM_MIXED_PAGE_SIZE__
588 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_4k_table;
589 //SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage32_4k_table;
590 #endif
591 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_data_pa = 0;
592 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_text_pa = 0;
593 SECURITY_READ_ONLY_LATE(static vm_map_address_t) commpage_text_user_va = 0;
594 SECURITY_READ_ONLY_LATE(static pmap_paddr_t) commpage_ro_data_pa = 0;
595 
596 
597 #if (DEVELOPMENT || DEBUG)
598 /* Caches whether the SPTM sysreg API has been enabled by the SPTM */
599 SECURITY_READ_ONLY_LATE(static bool) sptm_sysreg_available = false;
600 #endif /* (DEVELOPMENT || DEBUG) */
601 
602 /* PTE Define Macros */
603 
604 #ifndef SPTM_PTE_IN_FLIGHT_MARKER
605 /* SPTM TODO: Get rid of this once we export SPTM_PTE_IN_FLIGHT_MARKER from the SPTM. */
606 #define SPTM_PTE_IN_FLIGHT_MARKER 0x80U
607 #endif /* SPTM_PTE_IN_FLIGHT_MARKER */
608 
609 /**
610  * Determine whether a PTE has been marked as compressed.  This function also panics if
611  * the PTE contains bits that shouldn't be present in a compressed PTE, which is most of them.
612  *
613  * @param pte the PTE contents to check
614  * @param ptep the address of the PTE contents, for diagnostic purposes only
615  *
616  * @return true if the PTE is compressed, false otherwise
617  */
618 static inline bool
619 pte_is_compressed(pt_entry_t pte, pt_entry_t *ptep)
620 {
621 	const bool compressed = (!pte_is_valid(pte) && (pte & ARM_PTE_COMPRESSED));
622 	/**
623 	 * Check for bits that shouldn't be present in a compressed PTE.  This is everything except the
624 	 * compressed/compressed-alt bits, as well as the SPTM's in-flight marker which may be set while
625 	 * the SPTM is in the process of flushing the TLBs after marking a previously-valid PTE as
626 	 * compressed.
627 	 */
628 	if (__improbable(compressed && (pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER)))) {
629 		panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?",
630 		    ptep, pte, pte & ~(ARM_PTE_COMPRESSED_MASK | SPTM_PTE_IN_FLIGHT_MARKER));
631 	}
632 	return compressed;
633 }
634 
635 #define pte_is_wired(pte)                                                               \
636 	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
637 
638 #define pte_was_writeable(pte) \
639 	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
640 
641 #define pte_set_was_writeable(pte, was_writeable) \
642 	do {                                         \
643 	        if ((was_writeable)) {               \
644 	                (pte) |= ARM_PTE_WRITEABLE;  \
645 	        } else {                             \
646 	                (pte) &= ~ARM_PTE_WRITEABLE; \
647 	        }                                    \
648 	} while(0)
649 
650 
651 /**
652  * Update wired-mapping accounting in the PTD and the pmap's ledger.
653  *
654  * @param pmap The pmap against which to update accounting
655  * @param pte_p The PTE whose wired state is being changed
656  * @param wired Indicates whether the PTE is being wired or unwired.
657  */
658 static inline void
659 pte_update_wiredcnt(pmap_t pmap, pt_entry_t *pte_p, boolean_t wired)
660 {
661 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
662 	unsigned short *ptd_wiredcnt_ptr = &(ptep_get_info(pte_p)->wiredcnt);
663 	if (wired) {
664 		if (__improbable(os_atomic_inc_orig(ptd_wiredcnt_ptr, relaxed) == UINT16_MAX)) {
665 			panic("pmap %p (pte %p): wired count overflow", pmap, pte_p);
666 		}
667 		pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
668 	} else {
669 		if (__improbable(os_atomic_dec_orig(ptd_wiredcnt_ptr, relaxed) == 0)) {
670 			panic("pmap %p (pte %p): wired count underflow", pmap, pte_p);
671 		}
672 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
673 	}
674 }
675 
676 /*
677  * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
678  * therefore not requiring TLBI.  Use a store-load barrier to ensure subsequent loads
679  * will observe the updated PTE.
680  */
681 #define FLUSH_PTE()                                                                     \
682 	__builtin_arm_dmb(DMB_ISH);
683 
684 /*
685  * Synchronize updates to PTEs that were previously valid and thus may be cached in
686  * TLBs.  DSB is required to ensure the PTE stores have completed prior to the ensuing
687  * TLBI.  This should only require a store-store barrier, as subsequent accesses in
688  * program order will not issue until the DSB completes.  Prior loads may be reordered
689  * after the barrier, but their behavior should not be materially affected by the
690  * reordering.  For fault-driven PTE updates such as COW, PTE contents should not
691  * matter for loads until the access is re-driven well after the TLB update is
692  * synchronized.   For "involuntary" PTE access restriction due to paging lifecycle,
693  * we should be in a position to handle access faults.  For "voluntary" PTE access
694  * restriction due to unmapping or protection, the decision to restrict access should
695  * have a data dependency on prior loads in order to avoid a data race.
696  */
697 #define FLUSH_PTE_STRONG()                                                             \
698 	__builtin_arm_dsb(DSB_ISHST);
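/*
 * As a rule of thumb: write_pte() pairs its PTE stores with FLUSH_PTE() (DMB),
 * while write_pte_strong() is expected to pair them with FLUSH_PTE_STRONG()
 * (DSB) so that an immediately following TLBI cannot be reordered ahead of the
 * PTE writes.
 */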
699 
700 /**
701  * Write enough page table entries to map a single VM page. On systems where the
702  * VM page size does not match the hardware page size, multiple page table
703  * entries will need to be written.
704  *
705  * @note This function does not emit a barrier to ensure these page table writes
706  *       have completed before continuing, even though such a barrier is commonly
707  *       needed. If a DMB or DSB barrier is required, use write_pte() or
708  *       write_pte_strong() respectively instead of this function.
709  *
710  * @param ptep Pointer to the first page table entry to update.
711  * @param pte The value to write into each page table entry. In the case that
712  *            multiple PTEs are updated to a non-empty value, then the address
713  *            in this value will automatically be incremented for each PTE
714  *            write.
715  */
716 static void
717 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
718 {
719 	/**
720 	 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
721 	 * systems, which is why it's checked at runtime instead of compile time.
722 	 * The "unreachable" warning needs to be suppressed because it still is a
723 	 * compile time constant on some systems.
724 	 */
725 	__unreachable_ok_push
726 	if (TEST_PAGE_RATIO_4) {
727 		if (((uintptr_t)ptep) & 0x1f) {
728 			panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
729 			    __func__, ptep, (void*)pte);
730 		}
731 
732 		if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
733 			/**
734 			 * If we're writing an empty/compressed PTE value, then don't
735 			 * auto-increment the address for each PTE write.
736 			 */
737 			*ptep = pte;
738 			*(ptep + 1) = pte;
739 			*(ptep + 2) = pte;
740 			*(ptep + 3) = pte;
741 		} else {
742 			*ptep = pte;
743 			*(ptep + 1) = pte | 0x1000;
744 			*(ptep + 2) = pte | 0x2000;
745 			*(ptep + 3) = pte | 0x3000;
746 		}
747 	} else {
748 		*ptep = pte;
749 	}
750 	__unreachable_ok_pop
751 }
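/*
 * For illustration, on a PAGE_RATIO-4 configuration (16K VM pages composed of
 * four 4K hardware pages), a call such as
 *     write_pte_fast(ptep, pa_to_pte(pa) | ARM_PTE_TYPE_VALID | ...);
 * stores four consecutive PTEs mapping pa, pa + 0x1000, pa + 0x2000 and
 * pa + 0x3000, whereas an empty or compressed value is replicated unchanged
 * into all four slots.
 */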
752 
753 /**
754  * Writes enough page table entries to map a single VM page and then ensures
755  * those writes complete by executing a Data Memory Barrier.
756  *
757  * @note The DMB issued by this function is not strong enough to protect against
758  *       TLB invalidates from being reordered above the PTE writes. If a TLBI
759  *       instruction is going to immediately be called after this write, it's
760  *       recommended to call write_pte_strong() instead of this function.
761  *
762  * See the function header for write_pte_fast() for more details on the
763  * parameters.
764  */
765 void
766 write_pte(pt_entry_t *ptep, pt_entry_t pte)
767 {
768 	write_pte_fast(ptep, pte);
769 	FLUSH_PTE();
770 }
771 
772 /**
773  * Retrieve the pmap structure for the thread running on the current CPU.
774  */
775 pmap_t
776 current_pmap()
777 {
778 	const pmap_t current = vm_map_pmap(current_thread()->map);
779 	assert(current != NULL);
780 	return current;
781 }
782 
783 #if DEVELOPMENT || DEBUG
784 
785 /*
786  * Trace levels are controlled by a bitmask in which each
787  * level can be enabled/disabled by the (1<<level) position
788  * in the boot arg (pmap_trace).
789  * Level 0: PPL extension functionality
790  * Level 1: pmap lifecycle (create/destroy/switch)
791  * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
792  * Level 3: internal state management (attributes/fast-fault)
793  * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
794  */
795 
796 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
797 
798 #define PMAP_TRACE(level, ...) \
799 	if (__improbable((1 << (level)) & pmap_trace_mask)) { \
800 	        KDBG_RELEASE(__VA_ARGS__); \
801 	}
802 #else /* DEVELOPMENT || DEBUG */
803 
804 #define PMAP_TRACE(level, ...)
805 
806 #endif /* DEVELOPMENT || DEBUG */
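/*
 * Example: booting a DEVELOPMENT/DEBUG kernel with pmap_trace=0x6 sets bits 1
 * and 2 of pmap_trace_mask, enabling the pmap lifecycle and mapping lifecycle
 * trace levels described above while leaving all other levels disabled.
 */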
807 
808 
809 /*
810  * Internal function prototypes (forward declarations).
811  */
812 
813 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
814 
815 static void pmap_set_reference(ppnum_t pn);
816 
817 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
818 
819 static kern_return_t pmap_expand(
820 	pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
821 
822 static void pmap_remove_range(pmap_t, vm_map_address_t, vm_map_address_t);
823 
824 static tt_entry_t *pmap_tt1_allocate(pmap_t, uint8_t);
825 
826 static void pmap_tt1_deallocate(pmap_t, tt_entry_t *);
827 
828 static kern_return_t pmap_tt_allocate(
829 	pmap_t, tt_entry_t **, unsigned int, unsigned int);
830 
831 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
832 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
833 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
834 
835 static void pmap_unmap_commpage(
836 	pmap_t pmap);
837 
838 static boolean_t
839 pmap_is_64bit(pmap_t);
840 
841 
842 static void pmap_flush_tlb_for_paddr_async(pmap_paddr_t);
843 
844 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
845 
846 static boolean_t arm_clear_fast_fault(
847 	ppnum_t ppnum,
848 	vm_prot_t fault_type,
849 	uintptr_t pvh,
850 	pt_entry_t *pte_p,
851 	pp_attr_t attrs_to_clear);
852 
853 static void pmap_trim_self(pmap_t pmap);
854 static void pmap_trim_subord(pmap_t subord);
855 
856 
857 /*
858  * Temporary prototypes, while we wait for pmap_enter to move to taking an
859  * address instead of a page number.
860  */
861 kern_return_t
862 pmap_enter(
863 	pmap_t pmap,
864 	vm_map_address_t v,
865 	ppnum_t pn,
866 	vm_prot_t prot,
867 	vm_prot_t fault_type,
868 	unsigned int flags,
869 	boolean_t wired,
870 	pmap_mapping_type_t mapping_type);
871 
872 static kern_return_t
873 pmap_enter_addr(
874 	pmap_t pmap,
875 	vm_map_address_t v,
876 	pmap_paddr_t pa,
877 	vm_prot_t prot,
878 	vm_prot_t fault_type,
879 	unsigned int flags,
880 	boolean_t wired,
881 	pmap_mapping_type_t mapping_type);
882 
883 kern_return_t
884 pmap_enter_options_addr(
885 	pmap_t pmap,
886 	vm_map_address_t v,
887 	pmap_paddr_t pa,
888 	vm_prot_t prot,
889 	vm_prot_t fault_type,
890 	unsigned int flags,
891 	boolean_t wired,
892 	unsigned int options,
893 	__unused void   *arg,
894 	pmap_mapping_type_t mapping_type);
895 
896 #ifdef CONFIG_XNUPOST
897 kern_return_t pmap_test(void);
898 #endif /* CONFIG_XNUPOST */
899 
900 PMAP_SUPPORT_PROTOTYPES(
901 	kern_return_t,
902 	arm_fast_fault, (pmap_t pmap,
903 	vm_map_address_t va,
904 	vm_prot_t fault_type,
905 	bool was_af_fault,
906 	bool from_user), ARM_FAST_FAULT_INDEX);
907 
908 PMAP_SUPPORT_PROTOTYPES(
909 	boolean_t,
910 	arm_force_fast_fault, (ppnum_t ppnum,
911 	vm_prot_t allow_mode,
912 	int options), ARM_FORCE_FAST_FAULT_INDEX);
913 
914 MARK_AS_PMAP_TEXT static boolean_t
915 arm_force_fast_fault_with_flush_range(
916 	ppnum_t ppnum,
917 	vm_prot_t allow_mode,
918 	int options,
919 	locked_pvh_t *locked_pvh,
920 	pp_attr_t bits_to_clear,
921 	pmap_tlb_flush_range_t *flush_range);
922 
923 PMAP_SUPPORT_PROTOTYPES(
924 	void,
925 	pmap_batch_set_cache_attributes, (
926 		const unified_page_list_t * page_list,
927 		unsigned int cacheattr,
928 		bool update_attr_table), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
929 
930 PMAP_SUPPORT_PROTOTYPES(
931 	void,
932 	pmap_change_wiring, (pmap_t pmap,
933 	vm_map_address_t v,
934 	boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
935 
936 PMAP_SUPPORT_PROTOTYPES(
937 	pmap_t,
938 	pmap_create_options, (ledger_t ledger,
939 	vm_map_size_t size,
940 	unsigned int flags,
941 	kern_return_t * kr), PMAP_CREATE_INDEX);
942 
943 PMAP_SUPPORT_PROTOTYPES(
944 	void,
945 	pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
946 
947 PMAP_SUPPORT_PROTOTYPES(
948 	kern_return_t,
949 	pmap_enter_options, (pmap_t pmap,
950 	vm_map_address_t v,
951 	pmap_paddr_t pa,
952 	vm_prot_t prot,
953 	vm_prot_t fault_type,
954 	unsigned int flags,
955 	boolean_t wired,
956 	unsigned int options,
957 	pmap_mapping_type_t mapping_type), PMAP_ENTER_OPTIONS_INDEX);
958 
959 PMAP_SUPPORT_PROTOTYPES(
960 	pmap_paddr_t,
961 	pmap_find_pa, (pmap_t pmap,
962 	addr64_t va), PMAP_FIND_PA_INDEX);
963 
964 PMAP_SUPPORT_PROTOTYPES(
965 	kern_return_t,
966 	pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
967 
968 
969 PMAP_SUPPORT_PROTOTYPES(
970 	boolean_t,
971 	pmap_is_empty, (pmap_t pmap,
972 	vm_map_offset_t va_start,
973 	vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
974 
975 
976 PMAP_SUPPORT_PROTOTYPES(
977 	unsigned int,
978 	pmap_map_cpu_windows_copy, (ppnum_t pn,
979 	vm_prot_t prot,
980 	unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
981 
982 PMAP_SUPPORT_PROTOTYPES(
983 	void,
984 	pmap_ro_zone_memcpy, (zone_id_t zid,
985 	vm_offset_t va,
986 	vm_offset_t offset,
987 	const vm_offset_t new_data,
988 	vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
989 
990 PMAP_SUPPORT_PROTOTYPES(
991 	uint64_t,
992 	pmap_ro_zone_atomic_op, (zone_id_t zid,
993 	vm_offset_t va,
994 	vm_offset_t offset,
995 	zro_atomic_op_t op,
996 	uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
997 
998 PMAP_SUPPORT_PROTOTYPES(
999 	void,
1000 	pmap_ro_zone_bzero, (zone_id_t zid,
1001 	vm_offset_t va,
1002 	vm_offset_t offset,
1003 	vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
1004 
1005 PMAP_SUPPORT_PROTOTYPES(
1006 	kern_return_t,
1007 	pmap_nest, (pmap_t grand,
1008 	pmap_t subord,
1009 	addr64_t vstart,
1010 	uint64_t size), PMAP_NEST_INDEX);
1011 
1012 PMAP_SUPPORT_PROTOTYPES(
1013 	void,
1014 	pmap_page_protect_options, (ppnum_t ppnum,
1015 	vm_prot_t prot,
1016 	unsigned int options,
1017 	void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1018 
1019 PMAP_SUPPORT_PROTOTYPES(
1020 	vm_map_address_t,
1021 	pmap_protect_options, (pmap_t pmap,
1022 	vm_map_address_t start,
1023 	vm_map_address_t end,
1024 	vm_prot_t prot,
1025 	unsigned int options,
1026 	void *args), PMAP_PROTECT_OPTIONS_INDEX);
1027 
1028 PMAP_SUPPORT_PROTOTYPES(
1029 	kern_return_t,
1030 	pmap_query_page_info, (pmap_t pmap,
1031 	vm_map_offset_t va,
1032 	int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1033 
1034 PMAP_SUPPORT_PROTOTYPES(
1035 	mach_vm_size_t,
1036 	pmap_query_resident, (pmap_t pmap,
1037 	vm_map_address_t start,
1038 	vm_map_address_t end,
1039 	mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1040 
1041 PMAP_SUPPORT_PROTOTYPES(
1042 	void,
1043 	pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1044 
1045 PMAP_SUPPORT_PROTOTYPES(
1046 	vm_map_address_t,
1047 	pmap_remove_options, (pmap_t pmap,
1048 	vm_map_address_t start,
1049 	vm_map_address_t end,
1050 	int options), PMAP_REMOVE_OPTIONS_INDEX);
1051 
1052 
1053 PMAP_SUPPORT_PROTOTYPES(
1054 	void,
1055 	pmap_set_cache_attributes, (ppnum_t pn,
1056 	unsigned int cacheattr,
1057 	bool update_attr_table), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1058 
1059 PMAP_SUPPORT_PROTOTYPES(
1060 	void,
1061 	pmap_update_compressor_page, (ppnum_t pn,
1062 	unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1063 
1064 PMAP_SUPPORT_PROTOTYPES(
1065 	void,
1066 	pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1067 
1068 #if MACH_ASSERT
1069 PMAP_SUPPORT_PROTOTYPES(
1070 	void,
1071 	pmap_set_process, (pmap_t pmap,
1072 	int pid,
1073 	char *procname), PMAP_SET_PROCESS_INDEX);
1074 #endif
1075 
1076 PMAP_SUPPORT_PROTOTYPES(
1077 	void,
1078 	pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1079 
1080 PMAP_SUPPORT_PROTOTYPES(
1081 	void,
1082 	pmap_unnest_options, (pmap_t grand,
1083 	addr64_t vaddr,
1084 	uint64_t size,
1085 	unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1086 
1087 PMAP_SUPPORT_PROTOTYPES(
1088 	void,
1089 	phys_attribute_set, (ppnum_t pn,
1090 	unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1091 
1092 PMAP_SUPPORT_PROTOTYPES(
1093 	void,
1094 	phys_attribute_clear, (ppnum_t pn,
1095 	unsigned int bits,
1096 	int options,
1097 	void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1098 
1099 #if __ARM_RANGE_TLBI__
1100 PMAP_SUPPORT_PROTOTYPES(
1101 	vm_map_address_t,
1102 	phys_attribute_clear_range, (pmap_t pmap,
1103 	vm_map_address_t start,
1104 	vm_map_address_t end,
1105 	unsigned int bits,
1106 	unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1107 #endif /* __ARM_RANGE_TLBI__ */
1108 
1109 
1110 PMAP_SUPPORT_PROTOTYPES(
1111 	void,
1112 	pmap_switch, (pmap_t pmap, thread_t thread), PMAP_SWITCH_INDEX);
1113 
1114 PMAP_SUPPORT_PROTOTYPES(
1115 	void,
1116 	pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1117 
1118 PMAP_SUPPORT_PROTOTYPES(
1119 	void,
1120 	pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1121 
1122 PMAP_SUPPORT_PROTOTYPES(
1123 	void,
1124 	pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1125 
1126 PMAP_SUPPORT_PROTOTYPES(
1127 	void,
1128 	pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1129 
1130 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1131 PMAP_SUPPORT_PROTOTYPES(
1132 	void,
1133 	pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1134 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1135 
1136 PMAP_SUPPORT_PROTOTYPES(
1137 	void,
1138 	pmap_trim, (pmap_t grand,
1139 	pmap_t subord,
1140 	addr64_t vstart,
1141 	uint64_t size), PMAP_TRIM_INDEX);
1142 
1143 #if HAS_APPLE_PAC
1144 PMAP_SUPPORT_PROTOTYPES(
1145 	void *,
1146 	pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1147 PMAP_SUPPORT_PROTOTYPES(
1148 	void *,
1149 	pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1150 #endif /* HAS_APPLE_PAC */
1151 
1152 
1153 void pmap_footprint_suspend(vm_map_t    map,
1154     boolean_t   suspend);
1155 PMAP_SUPPORT_PROTOTYPES(
1156 	void,
1157 	pmap_footprint_suspend, (vm_map_t map,
1158 	boolean_t suspend),
1159 	PMAP_FOOTPRINT_SUSPEND_INDEX);
1160 
1161 
1162 
1163 
1164 
1165 /*
1166  * The low global vector page is mapped at a fixed alias.
1167  * Since the page size is 16k for H8 and newer, we map the globals to a 16k
1168  * aligned address. Readers of the globals (e.g. lldb, panic server) need
1169  * to check both addresses anyway for backward compatibility. So for now
1170  * we leave H6 and H7 where they were.
1171  */
1172 #if (ARM_PGSHIFT == 14)
1173 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1174 #else
1175 #define LOWGLOBAL_ALIAS         (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1176 #endif
1177 
1178 static inline void
1179 PMAP_ZINFO_PALLOC(
1180 	pmap_t pmap, int bytes)
1181 {
1182 	pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1183 }
1184 
1185 static inline void
1186 PMAP_ZINFO_PFREE(
1187 	pmap_t pmap,
1188 	int bytes)
1189 {
1190 	pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1191 }
1192 
1193 void
1194 pmap_tt_ledger_credit(
1195 	pmap_t          pmap,
1196 	vm_size_t       size)
1197 {
1198 	if (pmap != kernel_pmap) {
1199 		pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1200 		pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1201 	}
1202 }
1203 
1204 void
1205 pmap_tt_ledger_debit(
1206 	pmap_t          pmap,
1207 	vm_size_t       size)
1208 {
1209 	if (pmap != kernel_pmap) {
1210 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1211 		pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1212 	}
1213 }
1214 
1215 static inline void
1216 pmap_update_plru(uint16_t asid_index __unused)
1217 {
1218 #if !HAS_16BIT_ASID
1219 	if (__probable(pmap_asid_plru)) {
1220 		unsigned plru_index = asid_index >> 6;
1221 		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1222 			asid_plru_generation[plru_index] = ++asid_plru_gencount;
1223 			asid_plru_bitmap[plru_index] = ((plru_index == 0) ? ~1ULL : UINT64_MAX);
1224 		}
1225 	}
1226 #endif /* !HAS_16BIT_ASID */
1227 }
1228 
1229 static bool
1230 alloc_asid(pmap_t pmap)
1231 {
1232 	int vasid = -1;
1233 
1234 	pmap_simple_lock(&asid_lock);
1235 
1236 #if !HAS_16BIT_ASID
1237 	if (__probable(pmap_asid_plru)) {
1238 		unsigned plru_index = 0;
1239 		uint64_t lowest_gen = asid_plru_generation[0];
1240 		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1241 		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1242 			if (asid_plru_generation[i] < lowest_gen) {
1243 				plru_index = i;
1244 				lowest_gen = asid_plru_generation[i];
1245 				lowest_gen_bitmap = asid_plru_bitmap[i];
1246 			}
1247 		}
1248 
1249 		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += (MAX_HW_ASIDS >> 6)) {
1250 			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1251 			if (temp_plru) {
1252 				vasid = (plru_index << 6) + lsb_first(temp_plru);
1253 #if DEVELOPMENT || DEBUG
1254 				++pmap_asid_hits;
1255 #endif
1256 				break;
1257 			}
1258 		}
1259 	}
1260 #else
1261 	/**
1262 	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
1263 	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
1264 	 * However, we first try to allocate starting from the position of the most-recently allocated
1265 	 * ASID.  This is done both as an allocator performance optimization (as it avoids crowding the
1266 	 * lower bit positions and then re-checking those same lower positions every time we allocate
1267 	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
1268 	 * reuse.  This increases the difficulty of leveraging ASID reuse to train branch predictor
1269 	 * logic, without requiring prohibitively expensive RCTX instructions.
1270 	 */
1271 	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
1272 #endif /* !HAS_16BIT_ASID */
1273 	if (__improbable(vasid < 0)) {
1274 		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
1275 		// slightly better with the collision detection scheme used by pmap_switch_internal().
1276 		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1277 #if DEVELOPMENT || DEBUG
1278 		++pmap_asid_misses;
1279 #endif
1280 	}
1281 	if (__improbable(vasid < 0)) {
1282 		pmap_simple_unlock(&asid_lock);
1283 		return false;
1284 	}
1285 	assert((uint32_t)vasid < pmap_max_asids);
1286 	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1287 	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1288 	const uint16_t hw_asid = (uint16_t)(vasid & (MAX_HW_ASIDS - 1));
1289 #if HAS_16BIT_ASID
1290 	last_allocated_asid = hw_asid;
1291 #endif /* HAS_16BIT_ASID */
1292 	pmap_simple_unlock(&asid_lock);
1293 	assert(hw_asid != 0); // Should never alias kernel ASID
1294 	pmap->asid = (uint16_t)vasid;
1295 	pmap_update_plru(hw_asid);
1296 	return true;
1297 }
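/*
 * Note on the vasid -> hw_asid derivation above: assuming MAX_HW_ASIDS is a
 * power of two (e.g. 256 on targets without 16-bit ASIDs), virtual ASIDs that
 * share the same low bits alias to the same hardware ASID; the pLRU bitmap and
 * generation counters are what bias allocation away from recently used
 * hardware ASIDs.
 */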
1298 
1299 static void
1300 free_asid(pmap_t pmap)
1301 {
1302 	const uint16_t vasid = os_atomic_xchg(&pmap->asid, 0, relaxed);
1303 	if (__improbable(vasid == 0)) {
1304 		return;
1305 	}
1306 
1307 #if !HAS_16BIT_ASID
1308 	if (pmap_asid_plru) {
1309 		const uint16_t hw_asid = vasid & (MAX_HW_ASIDS - 1);
1310 		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1311 	}
1312 #endif /* !HAS_16BIT_ASID */
1313 	pmap_simple_lock(&asid_lock);
1314 	assert(!bitmap_test(&asid_bitmap[0], vasid));
1315 	bitmap_set(&asid_bitmap[0], vasid);
1316 	pmap_simple_unlock(&asid_lock);
1317 }
1318 
1319 
1320 boolean_t
1321 pmap_valid_address(
1322 	pmap_paddr_t addr)
1323 {
1324 	return pa_valid(addr);
1325 }
1326 
1327 
1328 
1329 
1330 
1331 
1332 /*
1333  *      Map memory at initialization.  The physical addresses being
1334  *      mapped are not managed and are never unmapped.
1335  *
1336  *      For now, VM is already on; we only need to map the
1337  *      specified memory.
1338  */
1339 vm_map_address_t
1340 pmap_map(
1341 	vm_map_address_t virt,
1342 	vm_offset_t start,
1343 	vm_offset_t end,
1344 	vm_prot_t prot,
1345 	unsigned int flags)
1346 {
1347 	kern_return_t   kr;
1348 	vm_size_t       ps;
1349 
1350 	ps = PAGE_SIZE;
1351 	while (start < end) {
1352 		kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1353 		    prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1354 
1355 		if (kr != KERN_SUCCESS) {
1356 			panic("%s: failed pmap_enter, "
1357 			    "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1358 			    __FUNCTION__,
1359 			    (void *) virt, (void *) start, (void *) end, prot, flags);
1360 		}
1361 
1362 		virt += ps;
1363 		start += ps;
1364 	}
1365 
1366 
1367 	return virt;
1368 }
1369 
1370 /**
1371  * Force the permission of a PTE to be kernel RO if a page has XNU_PROTECTED_IO type.
1372  *
1373  * @param paddr The physical address of the page.
1374  * @param tmplate The PTE value to be evaluated.
1375  *
1376  * @return A new PTE value with permission bits modified.
1377  */
1378 static inline
1379 pt_entry_t
1380 pmap_force_pte_kernel_ro_if_protected_io(pmap_paddr_t paddr, pt_entry_t tmplate)
1381 {
1382 	/**
1383 	 * When requesting RW mappings to an XNU_PROTECTED_IO frame, downgrade
1384 	 * the mapping to RO. This is required because IOKit relies on this
1385 	 * behavior currently in the PPL.
1386 	 */
1387 	const sptm_frame_type_t frame_type = sptm_get_frame_type(paddr);
1388 	if (frame_type == XNU_PROTECTED_IO) {
1389 		/* Downgrade the mapping by converting KERN_RW to KERN_RO. */
1390 		const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1391 		switch (xprr_perm) {
1392 		case XPRR_KERN_RO_PERM:
1393 			break;
1394 		case XPRR_KERN_RW_PERM:
1395 			tmplate &= ~ARM_PTE_XPRR_MASK;
1396 			tmplate |= xprr_perm_to_pte(XPRR_KERN_RO_PERM);
1397 			break;
1398 		default:
1399 			panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1400 		}
1401 	}
1402 
1403 	return tmplate;
1404 }
1405 
1406 vm_map_address_t
1407 pmap_map_bd_with_options(
1408 	vm_map_address_t virt,
1409 	vm_offset_t start,
1410 	vm_offset_t end,
1411 	vm_prot_t prot,
1412 	int32_t options)
1413 {
1414 	pt_entry_t      tmplate;
1415 	vm_map_address_t vaddr;
1416 	vm_offset_t     paddr;
1417 	pt_entry_t      mem_attr;
1418 
1419 	switch (options & PMAP_MAP_BD_MASK) {
1420 	case PMAP_MAP_BD_WCOMB:
1421 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1422 		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1423 		break;
1424 	case PMAP_MAP_BD_POSTED:
1425 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1426 		break;
1427 	case PMAP_MAP_BD_POSTED_REORDERED:
1428 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1429 		break;
1430 	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1431 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1432 		break;
1433 	default:
1434 		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1435 		break;
1436 	}
1437 
1438 	tmplate = ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
1439 	    mem_attr | ARM_PTE_TYPE_VALID | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
1440 
1441 #if __ARM_KERNEL_PROTECT__
1442 	tmplate |= ARM_PTE_NG;
1443 #endif /* __ARM_KERNEL_PROTECT__ */
1444 
1445 	vaddr = virt;
1446 	paddr = start;
1447 	while (paddr < end) {
1448 		__assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, vaddr, pmap_force_pte_kernel_ro_if_protected_io(paddr, tmplate) | pa_to_pte(paddr));
1449 		assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1450 
1451 		vaddr += PAGE_SIZE;
1452 		paddr += PAGE_SIZE;
1453 	}
1454 
1455 	return vaddr;
1456 }
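/*
 * Illustrative use (addresses assumed): early boot code can map a device
 * register aperture write-combined with something like
 *     vaddr = pmap_map_bd_with_options(vaddr, pa_start, pa_end,
 *         VM_PROT_READ | VM_PROT_WRITE, PMAP_MAP_BD_WCOMB);
 * The value returned is the first VA beyond the last page mapped.
 */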
1457 
1458 /*
1459  *      Back-door routine for mapping kernel VM at initialization.
1460  *      Useful for mapping memory outside the range
1461  *      [vm_first_phys, vm_last_phys] (i.e., devices).
1462  *      Otherwise like pmap_map.
1463  */
1464 vm_map_address_t
1465 pmap_map_bd(
1466 	vm_map_address_t virt,
1467 	vm_offset_t start,
1468 	vm_offset_t end,
1469 	vm_prot_t prot)
1470 {
1471 	return pmap_map_bd_with_options(virt, start, end, prot, 0);
1472 }
1473 
1474 /*
1475  *      Back-door routine for mapping kernel VM at initialization.
1476  *      Useful for mapping memory specific physical addresses in early
1477  *      boot (i.e., before kernel_map is initialized).
1478  *
1479  *      Maps are in the VM_HIGH_KERNEL_WINDOW area.
1480  */
1481 
1482 vm_map_address_t
1483 pmap_map_high_window_bd(
1484 	vm_offset_t pa_start,
1485 	vm_size_t len,
1486 	vm_prot_t prot)
1487 {
1488 	pt_entry_t              *ptep, pte;
1489 	vm_map_address_t        va_start = VREGION1_START;
1490 	vm_map_address_t        va_max = VREGION1_START + VREGION1_SIZE;
1491 	vm_map_address_t        va_end;
1492 	vm_map_address_t        va;
1493 	vm_size_t               offset;
1494 
1495 	offset = pa_start & PAGE_MASK;
1496 	pa_start -= offset;
1497 	len += offset;
1498 
1499 	if (len > (va_max - va_start)) {
1500 		panic("%s: area too large, "
1501 		    "pa_start=%p, len=%p, prot=0x%x",
1502 		    __FUNCTION__,
1503 		    (void*)pa_start, (void*)len, prot);
1504 	}
1505 
1506 scan:
1507 	for (; va_start < va_max; va_start += PAGE_SIZE) {
1508 		ptep = pmap_pte(kernel_pmap, va_start);
1509 		assert(!pte_is_compressed(*ptep, ptep));
1510 		if (!pte_is_valid(*ptep)) {
1511 			break;
1512 		}
1513 	}
1514 	if (va_start > va_max) {
1515 		panic("%s: insufficient pages, "
1516 		    "pa_start=%p, len=%p, prot=0x%x",
1517 		    __FUNCTION__,
1518 		    (void*)pa_start, (void*)len, prot);
1519 	}
1520 
1521 	for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
1522 		ptep = pmap_pte(kernel_pmap, va_end);
1523 		assert(!pte_is_compressed(*ptep, ptep));
1524 		if (pte_is_valid(*ptep)) {
1525 			va_start = va_end + PAGE_SIZE;
1526 			goto scan;
1527 		}
1528 	}
1529 
1530 	for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
1531 		ptep = pmap_pte(kernel_pmap, va);
1532 		pte = pa_to_pte(pa_start)
1533 		    | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1534 		    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1535 		    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT)
1536 		    | ARM_PTE_SH(SH_OUTER_MEMORY);
1537 #if __ARM_KERNEL_PROTECT__
1538 		pte |= ARM_PTE_NG;
1539 #endif /* __ARM_KERNEL_PROTECT__ */
1540 		__assert_only sptm_return_t ret = sptm_map_page(kernel_pmap->ttep, va, pte);
1541 		assert((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID));
1542 	}
1543 #if KASAN
1544 	kasan_notify_address(va_start, len);
1545 #endif
1546 	return va_start;
1547 }
1548 
1549 /*
1550  * pmap_get_arm64_prot
1551  *
1552  * return effective armv8 VMSA block protections including
1553  * table AP/PXN/XN overrides of a pmap entry
1554  *
1555  */
1556 
1557 uint64_t
1558 pmap_get_arm64_prot(
1559 	pmap_t pmap,
1560 	vm_offset_t addr)
1561 {
1562 	tt_entry_t tte = 0;
1563 	unsigned int level = 0;
1564 	uint64_t effective_prot_bits = 0;
1565 	uint64_t aggregate_tte = 0;
1566 	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
1567 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
1568 
1569 	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
1570 		tte = *pmap_ttne(pmap, level, addr);
1571 
1572 		if (!(tte & ARM_TTE_VALID)) {
1573 			return 0;
1574 		}
1575 
1576 		if ((level == pt_attr->pta_max_level) || tte_is_block(tte)) {
1577 			/* Block or page mapping; both have the same protection bit layout. */
1578 			break;
1579 		} else if (tte_is_table(tte)) {
1580 			/* All of the table bits we care about are overrides, so just OR them together. */
1581 			aggregate_tte |= tte;
1582 		}
1583 	}
1584 
1585 	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
1586 	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
1587 	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
1588 
1589 	/* Start with the PTE bits. */
1590 	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
1591 
1592 	/* Table AP bits mask out block/page AP bits */
1593 	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
1594 
1595 	/* XN/PXN bits can be OR'd in. */
1596 	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
1597 	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
1598 
1599 	return effective_prot_bits;
1600 }
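
/*
 * Illustrative sketch (hypothetical helper, example only): one way a caller
 * might decode the value returned above.  It assumes the caller has already
 * verified that `addr` is mapped, since an unmapped address returns 0.
 */
#if 0
static bool
example_kernel_writable(pmap_t pmap, vm_offset_t addr)
{
	const uint64_t prot = pmap_get_arm64_prot(pmap, addr);

	/* AP == RWNA: read/write at EL1, no EL0 access. */
	return (prot & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA);
}
#endif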
1601 
1602 /*
1603  *	Bootstrap the system enough to run with virtual memory.
1604  *
1605  *	The early VM initialization code has already allocated
1606  *	the first CPU's translation table and made entries for
1607  *	all the one-to-one mappings to be found there.
1608  *
1609  *	We must set up the kernel pmap structures, the
1610  *	physical-to-virtual translation lookup tables for the
1611  *	physical memory to be managed (between avail_start and
1612  *	avail_end).
1613  *
1614  *	Map the kernel's code and data, and allocate the system page table.
1615  *	Page_size must already be set.
1616  *
1617  *	Parameters:
1618  *	first_avail	first available physical page -
1619  *			   after kernel page tables
1620  *	avail_start	PA of first managed physical page
1621  *	avail_end	PA of last managed physical page
1622  */
1623 
1624 void
1625 pmap_bootstrap(
1626 	vm_offset_t vstart)
1627 {
1628 	vm_map_offset_t maxoffset;
1629 
1630 	lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
1631 
1632 #if DEVELOPMENT || DEBUG
1633 	if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
1634 		kprintf("Kernel traces for pmap operations enabled\n");
1635 	}
1636 #endif
1637 
1638 	/*
1639 	 *	Initialize the kernel pmap.
1640 	 */
1641 #if ARM_PARAMETERIZED_PMAP
1642 	kernel_pmap->pmap_pt_attr = native_pt_attr;
1643 #endif /* ARM_PARAMETERIZED_PMAP */
1644 #if HAS_APPLE_PAC
1645 	kernel_pmap->disable_jop = 0;
1646 #endif /* HAS_APPLE_PAC */
1647 	kernel_pmap->tte = cpu_tte;
1648 	kernel_pmap->ttep = cpu_ttep;
1649 	kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
1650 	kernel_pmap->max = UINTPTR_MAX;
1651 	os_ref_init_count_raw(&kernel_pmap->ref_count, &pmap_refgrp, 1);
1652 	kernel_pmap->nx_enabled = TRUE;
1653 	kernel_pmap->is_64bit = TRUE;
1654 #if CONFIG_ROSETTA
1655 	kernel_pmap->is_rosetta = FALSE;
1656 #endif
1657 
1658 #if ARM_PARAMETERIZED_PMAP
1659 	kernel_pmap->pmap_pt_attr = native_pt_attr;
1660 #endif /* ARM_PARAMETERIZED_PMAP */
1661 
1662 	kernel_pmap->nested_region_addr = 0x0ULL;
1663 	kernel_pmap->nested_region_size = 0x0ULL;
1664 	kernel_pmap->nested_region_unnested_table_bitmap = NULL;
1665 	kernel_pmap->type = PMAP_TYPE_KERNEL;
1666 
1667 	kernel_pmap->asid = 0;
1668 
1669 	pmap_lock_init(kernel_pmap);
1670 
1671 	pmap_max_asids = SPTMArgs->num_asids;
1672 
1673 	const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
1674 
1675 	/**
1676 	 * Bootstrap the core pmap data structures (e.g., pv_head_table,
1677 	 * pp_attr_table, etc). This function will use `avail_start` to allocate
1678 	 * space for these data structures.
1679 	 */
1680 	pmap_data_bootstrap();
1681 
1682 	/**
1683 	 * Bootstrap any necessary UAT data structures and values needed from the device tree.
1684 	 */
1685 	uat_bootstrap();
1686 
1687 	/**
1688 	 * Don't make any assumptions about the alignment of avail_start before this
1689 	 * point (i.e., pmap_data_bootstrap() performs allocations).
1690 	 */
1691 	avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
1692 
1693 	const pmap_paddr_t pmap_struct_start = avail_start;
1694 
1695 	asid_bitmap = (bitmap_t*)phystokv(avail_start);
1696 	avail_start = round_page(avail_start + asid_table_size);
1697 
1698 	memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
1699 
1700 	queue_init(&map_pmap_list);
1701 	queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
1702 
1703 	virtual_space_start = vstart;
1704 	virtual_space_end = VM_MAX_KERNEL_ADDRESS;
1705 
1706 	bitmap_full(&asid_bitmap[0], pmap_max_asids);
1707 	/* Clear the ASIDs which will alias the reserved kernel ASID of 0. */
1708 	for (unsigned int i = 0; i < pmap_max_asids; i += MAX_HW_ASIDS) {
1709 		bitmap_clear(&asid_bitmap[0], i);
1710 	}
1711 
1712 
1713 #if !HAS_16BIT_ASID
1714 	/**
1715 	 * Align the range of available hardware ASIDs to a multiple of 64 to enable the
1716 	 * masking used by the PLRU scheme.  This means we must handle the case in which
1717 	 * the returned hardware ASID is 0, which we do by clearing all vASIDs that will
1718 	 * alias the kernel ASID.
1719 	 */
1720 	pmap_max_asids = pmap_max_asids & ~63ul;
1721 	if (__improbable(pmap_max_asids == 0)) {
1722 		panic("%s: insufficient number of ASIDs (%u) supplied by SPTM", __func__, (unsigned int)pmap_max_asids);
1723 	}
1724 	pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
1725 	PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
1726 	_Static_assert(sizeof(asid_plru_bitmap[0]) == sizeof(uint64_t), "bitmap_t is not a 64-bit integer");
1727 	_Static_assert((MAX_HW_ASIDS % 64) == 0, "MAX_HW_ASIDS is not divisible by 64");
1728 	bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
1729 	bitmap_clear(&asid_plru_bitmap[0], 0);
1730 #endif /* !HAS_16BIT_ASID */
1731 
1732 
1733 	if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
1734 		maxoffset = trunc_page(maxoffset);
1735 		if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
1736 		    && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
1737 			arm_pmap_max_offset_default = maxoffset;
1738 		}
1739 	}
1740 	if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
1741 		maxoffset = trunc_page(maxoffset);
1742 		if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
1743 		    && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
1744 			arm64_pmap_max_offset_default = maxoffset;
1745 		}
1746 	}
1747 
1748 	PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
1749 
1750 
1751 #if DEVELOPMENT || DEBUG
1752 	PE_parse_boot_argn("vm_footprint_suspend_allowed",
1753 	    &vm_footprint_suspend_allowed,
1754 	    sizeof(vm_footprint_suspend_allowed));
1755 #endif /* DEVELOPMENT || DEBUG */
1756 
1757 #if KASAN
1758 	/* Shadow the CPU copy windows, as they fall outside of the physical aperture */
1759 	kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
1760 #endif /* KASAN */
1761 
1762 	/**
1763 	 * Ensure that avail_start is always left on a page boundary. The calling
1764 	 * code might not perform any alignment before allocating page tables so
1765 	 * this is important.
1766 	 */
1767 	avail_start = round_page(avail_start);
1768 
1769 
1770 #if (DEVELOPMENT || DEBUG)
1771 	(void)sptm_features_available(SPTM_FEATURE_SYSREG, &sptm_sysreg_available);
1772 #endif /* (DEVELOPMENT || DEBUG) */
1773 
1774 	/* Signal that the pmap has been bootstrapped */
1775 	pmap_bootstrapped = true;
1776 }
1777 
1778 /**
1779  * Helper for creating a populated commpage table
1780  *
1781  * In order to avoid burning extra pages on mapping the commpage, we create a
1782  * dedicated table hierarchy for the commpage.  We forcibly nest the translation tables from
1783  * this pmap into other pmaps.  The level we will nest at depends on the MMU configuration (page
1784  * size, TTBR range, etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
1785  *
1786  * @note This is NOT "the nested pmap" (which is used to nest the shared cache).
1787  *
1788  * @param rw_va Virtual address at which to insert a mapping to the kernel R/W commpage
1789  * @param ro_va Virtual address at which to insert a mapping to the kernel R/O commpage
1790  * @param rw_pa Physical address of kernel R/W commpage
1791  * @param ro_pa Physical address of kernel R/O commpage, may be 0 if not supported in this
1792  *              configuration
1793  * @param rx_pa Physical address of user executable (and kernel R/O) commpage, may be 0 if
1794  *              not supported in this configuration
1795  * @param pmap_create_flags Control flags for the temporary pmap created by this function
1796  *
1797  * @return the physical address of the created commpage table, typed as
1798  *         XNU_PAGE_TABLE_COMMPAGE and containing all relevant commpage mappings.
1799  */
1800 static pmap_paddr_t
1801 pmap_create_commpage_table(vm_map_address_t rw_va, vm_map_address_t ro_va,
1802     pmap_paddr_t rw_pa, pmap_paddr_t ro_pa, pmap_paddr_t rx_pa, unsigned int pmap_create_flags)
1803 {
1804 	pmap_t temp_commpage_pmap = pmap_create_options(NULL, 0, pmap_create_flags);
1805 	assert(temp_commpage_pmap != NULL);
1806 	assert(rw_pa != 0);
1807 	const pt_attr_t *pt_attr = pmap_get_pt_attr(temp_commpage_pmap);
1808 
1809 	/*
1810 	 * We only use pmap_expand to expand the pmap up to the commpage nesting level.  At that level
1811 	 * and beyond, all the newly created tables will be nested directly into the userspace region
1812 	 * for each process, and as such they must be of the dedicated SPTM commpage table type so that
1813 	 * the SPTM can enforce the commpage security model which forbids random replacement of commpage
1814 	 * mappings.
1815 	 */
1816 	kern_return_t kr = pmap_expand(temp_commpage_pmap, rw_va, 0, pt_attr_commpage_level(pt_attr));
1817 	assert(kr == KERN_SUCCESS);
1818 
1819 	pmap_paddr_t commpage_table_pa = 0;
1820 	for (unsigned int i = pt_attr_commpage_level(pt_attr); i < pt_attr_leaf_level(pt_attr); i++) {
1821 		pmap_paddr_t new_table = 0;
1822 		kr = pmap_page_alloc(&new_table, 0);
1823 		assert((kr == KERN_SUCCESS) && (new_table != 0));
1824 		if (commpage_table_pa == 0) {
1825 			commpage_table_pa = new_table;
1826 		}
1827 
1828 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1829 		retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
1830 		sptm_retype(new_table, XNU_DEFAULT, XNU_PAGE_TABLE_COMMPAGE, retype_params);
1831 
1832 		const sptm_tte_t table_tte = (new_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
1833 
1834 		sptm_map_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, i, rw_va),
1835 		    (sptm_pt_level_t)i, table_tte);
1836 	}
1837 
1838 	/*
1839 	 * Note the lack of ARM_PTE_NG here: commpage mappings are at fixed addresses and
1840 	 * frequently accessed, so we map them global to avoid unnecessary TLB pressure.
1841 	 */
1842 	static const sptm_pte_t commpage_pte_template = ARM_PTE_TYPE_VALID
1843 	    | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK)
1844 	    | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX
1845 	    | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF;
1846 
1847 	sptm_return_t sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, rw_va,
1848 	    commpage_pte_template | ARM_PTE_NX | pa_to_pte(rw_pa));
1849 	assert(sptm_ret == SPTM_SUCCESS);
1850 
1851 	if (ro_pa != 0) {
1852 		assert((ro_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1853 		sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, ro_va,
1854 		    commpage_pte_template | ARM_PTE_NX | pa_to_pte(ro_pa));
1855 		assert(sptm_ret == SPTM_SUCCESS);
1856 	}
1857 
1858 	if (rx_pa != 0) {
1859 		assert((commpage_text_user_va & ~pt_attr_twig_offmask(pt_attr)) == (rw_va & ~pt_attr_twig_offmask(pt_attr)));
1860 		assert((commpage_text_user_va != rw_va) && (commpage_text_user_va != ro_va));
1861 		sptm_ret = sptm_map_page(temp_commpage_pmap->ttep, commpage_text_user_va, commpage_pte_template | pa_to_pte(rx_pa));
1862 		assert(sptm_ret == SPTM_SUCCESS);
1863 	}
1864 
1865 
1866 	sptm_unmap_table(temp_commpage_pmap->ttep, pt_attr_align_va(pt_attr, pt_attr_commpage_level(pt_attr), rw_va),
1867 	    (sptm_pt_level_t)pt_attr_commpage_level(pt_attr));
1868 	pmap_destroy(temp_commpage_pmap);
1869 
1870 	return commpage_table_pa;
1871 }
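
/*
 * Illustrative note (the physical address below is hypothetical): a freshly
 * retyped table page at PA 0x801234000 would be linked in above with a table
 * descriptor of
 *     (0x801234000 & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID,
 * i.e. the table's output-address bits plus the "valid table" low bits.
 */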
1872 
1873 /**
1874  * Helper for creating all commpage tables applicable to the current configuration.
1875  *
1876  * @note This function is intended to be called during bootstrap.
1877  * @note This function assumes that pmap_create_commpages has already executed, and therefore
1878  *       the commpage_*_pa variables have been assigned to their final values.  commpage_data_pa
1879  *       is the kernel RW commpage and is assumed to be present on all configurations, so it
1880  *       therefore must be non-zero at this point.  The other variables are considered optional
1881  *       depending upon configuration and may be zero.
1882  */
1883 void pmap_prepare_commpages(void);
1884 void
1885 pmap_prepare_commpages(void)
1886 {
1887 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
1888 	assert(commpage_data_pa != 0);
1889 	sptm_retype(commpage_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RW, retype_params);
1890 	if (commpage_ro_data_pa != 0) {
1891 		sptm_retype(commpage_ro_data_pa, XNU_DEFAULT, XNU_COMMPAGE_RO, retype_params);
1892 	}
1893 	if (commpage_text_pa != 0) {
1894 		sptm_retype(commpage_text_pa, XNU_DEFAULT, XNU_COMMPAGE_RX, retype_params);
1895 	}
1896 
1897 	/*
1898 	 * User mapping of the comm page text section, for the 64-bit mapping only.
1899 	 *
1900 	 * We don't insert the text commpage into the 32-bit mapping because we don't want
1901 	 * 32-bit user processes to get this page mapped in; they should never call into
1902 	 * this page.
1903 	 */
1904 	commpage_default_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
1905 	    commpage_data_pa, commpage_ro_data_pa, commpage_text_pa, 0);
1906 
1907 	/*
1908 	 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
1909 	 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
1910 	 *
1911 	 * commpage32_default_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
1912 	 *    commpage_data_pa, commpage_ro_data_pa, 0, 0);
1913 	 */
1914 #if __ARM_MIXED_PAGE_SIZE__
1915 	commpage_4k_table = pmap_create_commpage_table(_COMM_PAGE64_BASE_ADDRESS, _COMM_PAGE64_RO_ADDRESS,
1916 	    commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
1917 
1918 	/*
1919 	 * SPTM TODO: Enable this, along with the appropriate 32-bit commpage address checks and flushes in the
1920 	 * SPTM, if we ever need to support arm64_32 processes in the SPTM.
1921 	 * commpage32_4k_table = pmap_create_commpage_table(_COMM_PAGE32_BASE_ADDRESS, _COMM_PAGE32_RO_ADDRESS,
1922 	 *    commpage_data_pa, commpage_ro_data_pa, 0, PMAP_CREATE_FORCE_4K_PAGES);
1923 	 */
1924 #endif /* __ARM_MIXED_PAGE_SIZE__ */
1925 
1926 }
1927 
1928 void
1929 pmap_virtual_space(
1930 	vm_offset_t *startp,
1931 	vm_offset_t *endp
1932 	)
1933 {
1934 	*startp = virtual_space_start;
1935 	*endp = virtual_space_end;
1936 }
1937 
1938 
1939 boolean_t
1940 pmap_virtual_region(
1941 	unsigned int region_select,
1942 	vm_map_offset_t *startp,
1943 	vm_map_size_t *size
1944 	)
1945 {
1946 	boolean_t       ret = FALSE;
1947 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
1948 	if (region_select == 0) {
1949 		/*
1950 		 * In this config, the bootstrap mappings should occupy their own L2
1951 		 * TTs, as they should be immutable after boot.  Having the associated
1952 		 * TTEs and PTEs in their own pages allows us to lock down those pages,
1953 		 * while allowing the rest of the kernel address range to be remapped.
1954 		 */
1955 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
1956 #if defined(ARM_LARGE_MEMORY)
1957 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
1958 #else
1959 		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
1960 #endif
1961 		ret = TRUE;
1962 	}
1963 
1964 #if defined(ARM_LARGE_MEMORY)
1965 	if (region_select == 1) {
1966 		*startp = VREGION1_START;
1967 		*size = VREGION1_SIZE;
1968 		ret = TRUE;
1969 	}
1970 #endif
1971 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
1972 #if defined(ARM_LARGE_MEMORY)
1973 	/* For large memory systems with no KTRR/CTRR such as virtual machines */
1974 	if (region_select == 0) {
1975 		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
1976 		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
1977 		ret = TRUE;
1978 	}
1979 
1980 	if (region_select == 1) {
1981 		*startp = VREGION1_START;
1982 		*size = VREGION1_SIZE;
1983 		ret = TRUE;
1984 	}
1985 #else /* !defined(ARM_LARGE_MEMORY) */
1986 	unsigned long low_global_vr_mask = 0;
1987 	vm_map_size_t low_global_vr_size = 0;
1988 
1989 	if (region_select == 0) {
1990 		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
1991 		if (!TEST_PAGE_SIZE_4K) {
1992 			*startp = gVirtBase & 0xFFFFFFFFFE000000;
1993 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
1994 		} else {
1995 			*startp = gVirtBase & 0xFFFFFFFFFF800000;
1996 			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
1997 		}
1998 		ret = TRUE;
1999 	}
2000 	if (region_select == 1) {
2001 		*startp = VREGION1_START;
2002 		*size = VREGION1_SIZE;
2003 		ret = TRUE;
2004 	}
2005 	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2006 	if (!TEST_PAGE_SIZE_4K) {
2007 		low_global_vr_mask = 0xFFFFFFFFFE000000;
2008 		low_global_vr_size = 0x2000000;
2009 	} else {
2010 		low_global_vr_mask = 0xFFFFFFFFFF800000;
2011 		low_global_vr_size = 0x800000;
2012 	}
2013 
2014 	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2015 		*startp = LOW_GLOBAL_BASE_ADDRESS;
2016 		*size = low_global_vr_size;
2017 		ret = TRUE;
2018 	}
2019 
2020 	if (region_select == 3) {
2021 		/* In this config, we allow the bootstrap mappings to occupy the same
2022 		 * page table pages as the heap.
2023 		 */
2024 		*startp = VM_MIN_KERNEL_ADDRESS;
2025 		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2026 		ret = TRUE;
2027 	}
2028 #endif /* defined(ARM_LARGE_MEMORY) */
2029 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2030 	return ret;
2031 }
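
/*
 * Illustrative consumption sketch (hypothetical helper, example only): the
 * valid region_select indices are not necessarily dense in every
 * configuration, so this sketch probes a fixed range of indices rather than
 * stopping at the first FALSE return.
 */
#if 0
static void
example_walk_boot_regions(void)
{
	vm_map_offset_t region_start;
	vm_map_size_t region_size;

	for (unsigned int i = 0; i < 4; i++) {
		if (pmap_virtual_region(i, &region_start, &region_size)) {
			/* hand [region_start, region_start + region_size) to the VM layer */
		}
	}
}
#endif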
2032 
2033 /*
2034  * Routines to track and allocate physical pages during early boot.
2035  * On most systems this memory runs from first_avail through to avail_end
2036  * with no gaps.
2037  *
2038  * If the system supports ECC and ecc_bad_pages_count > 0, we
2039  * need to skip those pages.
2040  */
2041 
2042 static unsigned int avail_page_count = 0;
2043 static bool need_ram_ranges_init = true;
2044 
2045 
2046 /**
2047  * Checks to see if a given page is in
2048  * the array of known bad pages
2049  *
2050  * @param ppn page number to check
2051  */
2052 bool
2053 pmap_is_bad_ram(__unused ppnum_t ppn)
2054 {
2055 	return false;
2056 }
2057 
2058 /**
2059  * Prepare bad ram pages to be skipped.
2060  */
2061 
2062 /*
2063  * Initialize the count of available pages. No lock needed here,
2064  * as this code is called while kernel boot up is single threaded.
2065  */
2066 static void
2067 initialize_ram_ranges(void)
2068 {
2069 	pmap_paddr_t first = first_avail;
2070 	pmap_paddr_t end = avail_end;
2071 
2072 	assert(first <= end);
2073 	assert(first == (first & ~PAGE_MASK));
2074 	assert(end == (end & ~PAGE_MASK));
2075 	avail_page_count = atop(end - first);
2076 
2077 	need_ram_ranges_init = false;
2078 
2079 }
2080 
2081 unsigned int
2082 pmap_free_pages(
2083 	void)
2084 {
2085 	if (need_ram_ranges_init) {
2086 		initialize_ram_ranges();
2087 	}
2088 	return avail_page_count;
2089 }
2090 
2091 unsigned int
2092 pmap_free_pages_span(
2093 	void)
2094 {
2095 	if (need_ram_ranges_init) {
2096 		initialize_ram_ranges();
2097 	}
2098 	return (unsigned int)atop(avail_end - first_avail);
2099 }
2100 
2101 
2102 boolean_t
2103 pmap_next_page_hi(
2104 	ppnum_t            * pnum,
2105 	__unused boolean_t might_free)
2106 {
2107 	return pmap_next_page(pnum);
2108 }
2109 
2110 
2111 boolean_t
2112 pmap_next_page(
2113 	ppnum_t *pnum)
2114 {
2115 	if (need_ram_ranges_init) {
2116 		initialize_ram_ranges();
2117 	}
2118 
2119 
2120 	if (first_avail != avail_end) {
2121 		*pnum = (ppnum_t)atop(first_avail);
2122 		first_avail += PAGE_SIZE;
2123 		assert(avail_page_count > 0);
2124 		--avail_page_count;
2125 		return TRUE;
2126 	}
2127 	assert(avail_page_count == 0);
2128 	return FALSE;
2129 }
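
/*
 * Illustrative sketch (hypothetical helper, example only): grabbing a single
 * physical page from the boot-time allocator above.  Real callers live in the
 * VM startup path; the panic string is purely illustrative.
 */
#if 0
static pmap_paddr_t
example_grab_boot_page(void)
{
	ppnum_t pn;

	if (!pmap_next_page(&pn)) {
		panic("example_grab_boot_page: boot-time physical memory exhausted");
	}
	return ptoa(pn);
}
#endif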
2130 
2131 
2132 
2133 
2134 /**
2135  * Helper function to check whether the given physical
2136  * page number is a restricted page.
2137  *
2138  * @param pn the physical page number to query.
2139  */
2140 bool
2141 pmap_is_page_restricted(ppnum_t pn)
2142 {
2143 	sptm_frame_type_t frame_type = sptm_get_frame_type(ptoa(pn));
2144 	return frame_type == XNU_KERNEL_RESTRICTED;
2145 }
2146 
2147 /*
2148  *	Initialize the pmap module.
2149  *	Called by vm_init, to initialize any structures that the pmap
2150  *	system needs to map virtual memory.
2151  */
2152 void
2153 pmap_init(
2154 	void)
2155 {
2156 	/*
2157 	 *	Protect page zero in the kernel map.
2158 	 *	(can be overruled by permanent translation
2159 	 *	table entries at page zero - see arm_vm_init).
2160 	 */
2161 	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2162 
2163 	pmap_initialized = TRUE;
2164 
2165 	/*
2166 	 *	Create the zone of physical maps
2167 	 *	and the physical-to-virtual entries.
2168 	 */
2169 	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2170 	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2171 
2172 
2173 	/*
2174 	 *	Initialize the pmap object (for tracking the vm_page_t
2175 	 *	structures for pages we allocate to be page tables in
2176 	 *	pmap_expand()).
2177 	 */
2178 	_vm_object_allocate(mem_size, pmap_object, VM_MAP_SERIAL_SPECIAL);
2179 	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2180 
2181 	/*
2182 	 *	Initialize the TXM VM object in the same way as the
2183 	 *	PMAP VM object.
2184 	 */
2185 	_vm_object_allocate(mem_size, txm_vm_object, VM_MAP_SERIAL_SPECIAL);
2186 	txm_vm_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2187 
2188 	/*
2189 	 * The values of [hard_]maxproc may have been scaled; make sure
2190 	 * they still do not exceed the value of pmap_max_asids.
2191 	 */
2192 	if ((uint32_t)maxproc > pmap_max_asids) {
2193 		maxproc = pmap_max_asids;
2194 	}
2195 	if ((uint32_t)hard_maxproc > pmap_max_asids) {
2196 		hard_maxproc = pmap_max_asids;
2197 	}
2198 }
2199 
2200 /**
2201  * Verify that a given physical page contains no mappings (outside of the
2202  * default physical aperture mapping).
2203  *
2204  * @param ppnum Physical page number to check there are no mappings to.
2205  *
2206  * @return True if there are no mappings, false otherwise or if the page is not
2207  *         kernel-managed.
2208  */
2209 bool
2210 pmap_verify_free(ppnum_t ppnum)
2211 {
2212 	const pmap_paddr_t pa = ptoa(ppnum);
2213 
2214 	assert(pa != vm_page_fictitious_addr);
2215 
2216 	/* Only mappings to kernel-managed physical memory are tracked. */
2217 	if (!pa_valid(pa)) {
2218 		return false;
2219 	}
2220 
2221 	const unsigned int pai = pa_index(pa);
2222 
2223 	return pvh_test_type(pai_to_pvh(pai), PVH_TYPE_NULL);
2224 }
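
/*
 * Illustrative sketch (hypothetical caller, example only): the kind of sanity
 * check a reclaim path might make before repurposing a physical page.
 */
#if 0
static void
example_reclaim_check(ppnum_t pn)
{
	if (!pmap_verify_free(pn)) {
		panic("example_reclaim_check: page %u still has mappings", pn);
	}
}
#endif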
2225 
2226 #if MACH_ASSERT
2227 /**
2228  * Verify that a given physical page contains no mappings (outside of the
2229  * default physical aperture mapping) and if it does, then panic.
2230  *
2231  * @note It's recommended to use pmap_verify_free() directly when operating in
2232  *       the PPL since the PVH lock isn't getting grabbed here (due to this code
2233  *       normally being called from outside of the PPL, and the pv_head_table
2234  *       can't be modified outside of the PPL).
2235  *
2236  * @param ppnum Physical page number to check there are no mappings to.
2237  */
2238 void
2239 pmap_assert_free(ppnum_t ppnum)
2240 {
2241 	const pmap_paddr_t pa = ptoa(ppnum);
2242 
2243 	/* Only mappings to kernel-managed physical memory are tracked. */
2244 	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
2245 		return;
2246 	}
2247 
2248 	const unsigned int pai = pa_index(pa);
2249 	const uintptr_t pvh = pai_to_pvh(pai);
2250 
2251 	/**
2252 	 * This function is always called from outside of the PPL. Because of this,
2253 	 * the PVH entry can't be locked. This function is generally only called
2254 	 * before the VM reclaims a physical page and shouldn't be creating new
2255 	 * mappings. Even if a new mapping is created while parsing the hierarchy,
2256 	 * the worst case is that the system will panic in another way, and we were
2257 	 * already about to panic anyway.
2258 	 */
2259 
2260 	/**
2261 	 * Since pmap_verify_free() returned false, that means there is at least one
2262 	 * mapping left. Let's get some extra info on the first mapping we find to
2263 	 * dump in the panic string (the common case is that there is one spare
2264 	 * mapping that was never unmapped).
2265 	 */
2266 	pt_entry_t *first_ptep = PT_ENTRY_NULL;
2267 
2268 	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
2269 		first_ptep = pvh_ptep(pvh);
2270 	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
2271 		pv_entry_t *pvep = pvh_pve_list(pvh);
2272 
2273 		/* Each PVE can contain multiple PTEs. Let's find the first one. */
2274 		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
2275 			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
2276 			if (first_ptep != PT_ENTRY_NULL) {
2277 				break;
2278 			}
2279 		}
2280 
2281 		/* The PVE should have at least one valid PTE. */
2282 		assert(first_ptep != PT_ENTRY_NULL);
2283 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
2284 		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
2285 		    __func__, (void*)pvh, pai);
2286 	} else {
2287 		/**
2288 		 * The mapping disappeared between here and the pmap_verify_free() call.
2289 		 * The only way that can happen is if the VM was racing this call with
2290 		 * a call that unmaps PTEs. Operations on this page should not be
2291 		 * occurring at the same time as this check, and unfortunately we can't
2292 		 * lock the PVH entry to prevent it, so just panic instead.
2293 		 */
2294 		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
2295 		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
2296 		    __func__, (void*)pvh, pai);
2297 	}
2298 
2299 	/* Panic with a unique string identifying the first bad mapping and owner. */
2300 	{
2301 		/* First PTE is mapped by the main CPUs. */
2302 		pmap_t pmap = ptep_get_pmap(first_ptep);
2303 		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";
2304 
2305 		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
2306 		    "%s CPU mapping (pmap: %p)",
2307 		    __func__, (uint64_t)pa, first_ptep, type, pmap);
2308 	}
2309 }
2310 #endif
2311 
2312 
2313 static vm_size_t
2314 pmap_root_alloc_size(pmap_t pmap)
2315 {
2316 #pragma unused(pmap)
2317 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2318 	unsigned int root_level = pt_attr_root_level(pt_attr);
2319 	return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2320 }
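
/*
 * Worked example (the geometry is hypothetical, for illustration only): a
 * root level resolving 11 bits of VA yields (index_mask >> shift) + 1 = 2048
 * entries, i.e. 2048 * sizeof(tt_entry_t) = 16 KB; a root level resolving
 * only 2 bits yields 4 * 8 = 32 bytes.
 */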
2321 
2322 /*
2323  *	Create and return a physical map.
2324  *
2325  *	If the size specified for the map
2326  *	is zero, the map is an actual physical
2327  *	map, and may be referenced by the
2328  *	hardware.
2329  *
2330  *	If the size specified is non-zero,
2331  *	the map will be used in software only, and
2332  *	is bounded by that size.
2333  */
2334 MARK_AS_PMAP_TEXT pmap_t
2335 pmap_create_options_internal(
2336 	ledger_t ledger,
2337 	vm_map_size_t size,
2338 	unsigned int flags,
2339 	kern_return_t *kr)
2340 {
2341 	pmap_t          p;
2342 	bool is_64bit = flags & PMAP_CREATE_64BIT;
2343 #if defined(HAS_APPLE_PAC)
2344 	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2345 #endif /* defined(HAS_APPLE_PAC) */
2346 	kern_return_t   local_kr = KERN_SUCCESS;
2347 	__unused uint8_t sptm_root_flags = SPTM_ROOT_PT_FLAGS_DEFAULT;
2348 	TXMAddressSpaceFlags_t txm_flags = kTXMAddressSpaceFlagInit;
2349 	const bool is_stage2 = false;
2350 
2351 	if (size != 0) {
2352 		{
2353 			// Size parameter should only be set for stage 2.
2354 			return PMAP_NULL;
2355 		}
2356 	}
2357 
2358 	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2359 		return PMAP_NULL;
2360 	}
2361 
2362 	/*
2363 	 *	Allocate a pmap struct from the pmap_zone.  Then allocate
2364 	 *	the translation table of the right size for the pmap.
2365 	 */
2366 	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2367 		local_kr = KERN_RESOURCE_SHORTAGE;
2368 		goto pmap_create_fail;
2369 	}
2370 
2371 	p->ledger = ledger;
2372 
2373 
2374 	p->pmap_vm_map_cs_enforced = false;
2375 	p->min = 0;
2376 
2377 
2378 #if CONFIG_ROSETTA
2379 	if (flags & PMAP_CREATE_ROSETTA) {
2380 		p->is_rosetta = TRUE;
2381 	} else {
2382 		p->is_rosetta = FALSE;
2383 	}
2384 #endif /* CONFIG_ROSETTA */
2385 #if defined(HAS_APPLE_PAC)
2386 	p->disable_jop = disable_jop;
2387 
2388 	if (p->disable_jop) {
2389 		sptm_root_flags &= ~SPTM_ROOT_PT_FLAG_JOP;
2390 	}
2391 #endif /* defined(HAS_APPLE_PAC) */
2392 
2393 	p->nested_region_true_start = 0;
2394 	p->nested_region_true_end = ~0;
2395 
2396 	p->nx_enabled = true;
2397 	p->is_64bit = is_64bit;
2398 	p->nested_pmap = PMAP_NULL;
2399 	p->type = PMAP_TYPE_USER;
2400 
2401 #if ARM_PARAMETERIZED_PMAP
2402 	/* Default to the native pt_attr */
2403 	p->pmap_pt_attr = native_pt_attr;
2404 #endif /* ARM_PARAMETERIZED_PMAP */
2405 #if __ARM_MIXED_PAGE_SIZE__
2406 	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2407 		p->pmap_pt_attr = &pmap_pt_attr_4k;
2408 	}
2409 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2410 	p->max = pmap_user_va_size(p);
2411 
2412 	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
2413 		local_kr = KERN_NO_SPACE;
2414 		goto id_alloc_fail;
2415 	}
2416 
2417 	/**
2418 	 * We expect top level translation tables to always fit into a single
2419 	 * physical page. This would also catch a misconfiguration if 4K
2420 	 * concatenated page tables needed more than one physical tt1 page.
2421 	 */
2422 	vm_size_t pmap_root_size = pmap_root_alloc_size(p);
2423 	if (__improbable(pmap_root_size > PAGE_SIZE)) {
2424 		panic("%s: translation tables do not fit into a single physical page %u", __FUNCTION__, (unsigned)pmap_root_size);
2425 	}
2426 
2427 	pmap_lock_init(p);
2428 
2429 	p->tte = pmap_tt1_allocate(p, sptm_root_flags);
2430 	if (!(p->tte)) {
2431 		local_kr = KERN_RESOURCE_SHORTAGE;
2432 		goto tt1_alloc_fail;
2433 	}
2434 
2435 	p->ttep = kvtophys_nofail((vm_offset_t)p->tte);
2436 	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
2437 
2438 	/*
2439 	 *  initialize the rest of the structure
2440 	 */
2441 	p->nested_region_addr = 0x0ULL;
2442 	p->nested_region_size = 0x0ULL;
2443 	p->nested_region_unnested_table_bitmap = NULL;
2444 
2445 	p->nested_has_no_bounds_ref = false;
2446 	p->nested_no_bounds_refcnt = 0;
2447 	p->nested_bounds_set = false;
2448 
2449 	p->associated_vm_map_serial_id = VM_MAP_SERIAL_NONE;
2450 
2451 #if MACH_ASSERT
2452 	p->pmap_pid = 0;
2453 	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
2454 #endif /* MACH_ASSERT */
2455 #if DEVELOPMENT || DEBUG
2456 	p->footprint_was_suspended = FALSE;
2457 #endif /* DEVELOPMENT || DEBUG */
2458 
2459 	os_ref_init_count_raw(&p->ref_count, &pmap_refgrp, 1);
2460 	pmap_simple_lock(&pmaps_lock);
2461 	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
2462 	pmap_simple_unlock(&pmaps_lock);
2463 
2464 	/**
2465 	 * The SPTM pmap's concurrency model can sometimes allow ledger balances to transiently
2466 	 * go negative.  Note that we still check overall ledger balance on pmap destruction.
2467 	 */
2468 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
2469 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
2470 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
2471 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
2472 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
2473 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
2474 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
2475 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
2476 	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
2477 
2478 	if (!is_stage2) {
2479 		/*
2480 		 * Complete initialization for the TXM address space. This needs to be done
2481 		 * after the SW ASID has been registered with the SPTM.
2482 		 * TXM enforcement does not apply to virtual machines.
2483 		 */
2484 		if (flags & PMAP_CREATE_TEST) {
2485 			txm_flags |= kTXMAddressSpaceFlagTest;
2486 		}
2487 
2488 		pmap_txmlock_init(p);
2489 		txm_register_address_space(p, p->asid, txm_flags);
2490 		p->txm_trust_level = kCSTrustUntrusted;
2491 	}
2492 
2493 	return p;
2494 
2495 tt1_alloc_fail:
2496 	pmap_get_pt_ops(p)->free_id(p);
2497 id_alloc_fail:
2498 	zfree(pmap_zone, p);
2499 pmap_create_fail:
2500 	*kr = local_kr;
2501 	return PMAP_NULL;
2502 }
2503 
2504 pmap_t
2505 pmap_create_options(
2506 	ledger_t ledger,
2507 	vm_map_size_t size,
2508 	unsigned int flags)
2509 {
2510 	pmap_t pmap;
2511 	kern_return_t kr = KERN_SUCCESS;
2512 
2513 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
2514 
2515 	ledger_reference(ledger);
2516 
2517 	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
2518 
2519 	if (pmap == PMAP_NULL) {
2520 		ledger_dereference(ledger);
2521 	}
2522 
2523 	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2524 
2525 	return pmap;
2526 }
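
/*
 * Illustrative usage sketch (hypothetical helper, example only): creating and
 * tearing down a 64-bit user pmap.  The ledger parameter is assumed to be a
 * task ledger owned by the caller; error handling beyond the NULL check is
 * elided.
 */
#if 0
static void
example_create_and_destroy_pmap(ledger_t task_ledger)
{
	pmap_t p = pmap_create_options(task_ledger, 0, PMAP_CREATE_64BIT);

	if (p != PMAP_NULL) {
		/* ... install into a vm_map, enter mappings, etc. ... */
		pmap_destroy(p);
	}
}
#endif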
2527 
2528 #if MACH_ASSERT
2529 MARK_AS_PMAP_TEXT void
2530 pmap_set_process_internal(
2531 	__unused pmap_t pmap,
2532 	__unused int pid,
2533 	__unused char *procname)
2534 {
2535 	if (pmap == NULL || pmap->pmap_pid == -1) {
2536 		return;
2537 	}
2538 
2539 	validate_pmap_mutable(pmap);
2540 
2541 	pmap->pmap_pid = pid;
2542 	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
2543 }
2544 #endif /* MACH_ASSERT */
2545 
2546 #if MACH_ASSERT
2547 void
2548 pmap_set_process(
2549 	pmap_t pmap,
2550 	int pid,
2551 	char *procname)
2552 {
2553 	pmap_set_process_internal(pmap, pid, procname);
2554 }
2555 #endif /* MACH_ASSERT */
2556 
2557 /*
2558  * pmap_deallocate_all_leaf_tts:
2559  *
2560  * Recursive function for deallocating all leaf TTEs.  Walks the given TT,
2561  * removing and deallocating all TTEs.
2562  */
2563 MARK_AS_PMAP_TEXT static void
2564 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, vm_map_address_t start_va, unsigned level)
2565 {
2566 	tt_entry_t tte = ARM_TTE_EMPTY;
2567 	tt_entry_t * ttep = NULL;
2568 	tt_entry_t * last_ttep = NULL;
2569 
2570 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2571 	const uint64_t size = pt_attr->pta_level_info[level].size;
2572 
2573 	assert(level < pt_attr_leaf_level(pt_attr));
2574 
2575 	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
2576 
2577 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
2578 	vm_map_address_t va = start_va;
2579 	for (ttep = first_ttep; ttep <= last_ttep; ttep += page_ratio, va += (size * page_ratio)) {
2580 		if (!(*ttep & ARM_TTE_VALID)) {
2581 			continue;
2582 		}
2583 
2584 		for (unsigned i = 0; i < page_ratio; i++) {
2585 			tte = ttep[i];
2586 
2587 			if (!(tte & ARM_TTE_VALID)) {
2588 				panic("%s: found unexpectedly invalid tte, ttep=%p, tte=%p, "
2589 				    "pmap=%p, first_ttep=%p, level=%u",
2590 				    __FUNCTION__, ttep + i, (void *)tte,
2591 				    pmap, first_ttep, level);
2592 			}
2593 
2594 			if (tte_is_block(tte)) {
2595 				panic("%s: found block mapping, ttep=%p, tte=%p, "
2596 				    "pmap=%p, first_ttep=%p, level=%u",
2597 				    __FUNCTION__, ttep + i, (void *)tte,
2598 				    pmap, first_ttep, level);
2599 			}
2600 
2601 			/* Must be valid, type table */
2602 			if (level < pt_attr_twig_level(pt_attr)) {
2603 				/* If we haven't reached the twig level, recurse to the next level. */
2604 				pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK),
2605 				    va + (size * i), level + 1);
2606 			}
2607 		}
2608 
2609 		/* Remove the TTE. */
2610 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
2611 		pmap_tte_deallocate(pmap, va, ttep, level);
2612 	}
2613 }
2614 
2615 /*
2616  * We maintain stats and ledgers so that a task's physical footprint is:
2617  * phys_footprint = ((internal - alternate_accounting)
2618  *                   + (internal_compressed - alternate_accounting_compressed)
2619  *                   + iokit_mapped
2620  *                   + purgeable_nonvolatile
2621  *                   + purgeable_nonvolatile_compressed
2622  *                   + page_table)
2623  * where "alternate_accounting" includes "iokit" and "purgeable" memory.
2624  */
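
/*
 * Worked example (all figures hypothetical): a task with internal = 100,
 * alternate_accounting = 10, internal_compressed = 20,
 * alternate_accounting_compressed = 0, iokit_mapped = 5,
 * purgeable_nonvolatile = 8, purgeable_nonvolatile_compressed = 2 and
 * page_table = 4 would report
 * phys_footprint = (100 - 10) + (20 - 0) + 5 + 8 + 2 + 4 = 129
 * (in whatever units the ledger entries use).
 */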
2625 
2626 /*
2627  *	Retire the given physical map from service.
2628  *	Should only be called if the map contains
2629  *	no valid mappings.
2630  */
2631 MARK_AS_PMAP_TEXT void
2632 pmap_destroy_internal(
2633 	pmap_t pmap)
2634 {
2635 	if (pmap == PMAP_NULL) {
2636 		return;
2637 	}
2638 
2639 	validate_pmap(pmap);
2640 
2641 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2642 	const bool is_stage2_pmap = false;
2643 
2644 	if (os_ref_release_raw(&pmap->ref_count, &pmap_refgrp) > 0) {
2645 		return;
2646 	}
2647 
2648 	if (!is_stage2_pmap) {
2649 		/*
2650 		 * Complete all clean up required for TXM. This needs to happen before the
2651 		 * SW ASID has been unregistered with the SPTM.
2652 		 */
2653 		txm_unregister_address_space(pmap);
2654 		pmap_txmlock_destroy(pmap);
2655 	}
2656 
2657 	/**
2658 	 * Drain any concurrent retype-sensitive SPTM operations.  This is needed to
2659 	 * ensure that we don't unmap and retype the page tables while those operations
2660 	 * are still finishing on other CPUs, leading to an SPTM violation.  In particular,
2661 	 * the multipage batched cacheability/attribute update code may issue SPTM calls
2662 	 * without holding the relevant PVH or pmap locks, so we can't guarantee those
2663 	 * calls have actually completed despite observing refcnt == 0.
2664 	 *
2665 	 * At this point, we CAN guarantee that:
2666 	 * 1) All prior PTE removals required to empty the pmap have completed and
2667 	 *    been synchronized with DSB, *except* the commpage removal which doesn't
2668 	 *    involve pages that can ever be retyped.  Subsequent calls not already
2669 	 *    in the retype epoch will no longer observe these mappings.
2670 	 * 2) The pmap now has a zero refcount, so in a correctly functioning system
2671 	 *    no further mappings will be requested for it.
2672 	 */
2673 	pmap_retype_epoch_prepare_drain();
2674 
2675 	if (!is_stage2_pmap) {
2676 		pmap_unmap_commpage(pmap);
2677 	}
2678 
2679 	pmap_simple_lock(&pmaps_lock);
2680 	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
2681 	pmap_simple_unlock(&pmaps_lock);
2682 
2683 	pmap_retype_epoch_drain();
2684 
2685 	pmap_trim_self(pmap);
2686 
2687 	/*
2688 	 *	Free the memory maps, then the
2689 	 *	pmap structure.
2690 	 */
2691 	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pmap->min, pt_attr_root_level(pt_attr));
2692 
2693 	if (pmap->tte) {
2694 		pmap_tt1_deallocate(pmap, pmap->tte);
2695 		pmap->tte = (tt_entry_t *) NULL;
2696 		pmap->ttep = 0;
2697 	}
2698 
2699 	if (pmap->type != PMAP_TYPE_NESTED) {
2700 		/* return its asid to the pool */
2701 		pmap_get_pt_ops(pmap)->free_id(pmap);
2702 		if (pmap->nested_pmap != NULL) {
2703 			/* release the reference we hold on the nested pmap */
2704 			pmap_destroy_internal(pmap->nested_pmap);
2705 		}
2706 	}
2707 
2708 	pmap_check_ledgers(pmap);
2709 
2710 	if (pmap->nested_region_unnested_table_bitmap) {
2711 		bitmap_free(pmap->nested_region_unnested_table_bitmap, pmap->nested_region_size >> pt_attr_twig_shift(pt_attr));
2712 	}
2713 
2714 	pmap_lock_destroy(pmap);
2715 	zfree(pmap_zone, pmap);
2716 }
2717 
2718 void
2719 pmap_destroy(
2720 	pmap_t pmap)
2721 {
2722 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
2723 
2724 	ledger_t ledger = pmap->ledger;
2725 
2726 	pmap_destroy_internal(pmap);
2727 
2728 	ledger_dereference(ledger);
2729 
2730 	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
2731 }
2732 
2733 
2734 /*
2735  *	Add a reference to the specified pmap.
2736  */
2737 MARK_AS_PMAP_TEXT void
2738 pmap_reference_internal(
2739 	pmap_t pmap)
2740 {
2741 	if (pmap != PMAP_NULL) {
2742 		validate_pmap_mutable(pmap);
2743 		os_ref_retain_raw(&pmap->ref_count, &pmap_refgrp);
2744 	}
2745 }
2746 
2747 void
2748 pmap_reference(
2749 	pmap_t pmap)
2750 {
2751 	pmap_reference_internal(pmap);
2752 }
2753 
2754 static sptm_frame_type_t
2755 get_sptm_pt_type(pmap_t pmap)
2756 {
2757 	const bool is_stage2_pmap = false;
2758 	if (is_stage2_pmap) {
2759 		assert(pmap->type != PMAP_TYPE_NESTED);
2760 		return XNU_STAGE2_PAGE_TABLE;
2761 	} else {
2762 		return pmap->type == PMAP_TYPE_NESTED ? XNU_PAGE_TABLE_SHARED : XNU_PAGE_TABLE;
2763 	}
2764 }
2765 
2766 static tt_entry_t *
2767 pmap_tt1_allocate(pmap_t pmap, uint8_t sptm_root_flags)
2768 {
2769 	pmap_paddr_t pa = 0;
2770 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2771 	const bool is_stage2_pmap = false;
2772 
2773 	const kern_return_t ret = pmap_page_alloc(&pa, PMAP_PAGE_NOZEROFILL);
2774 
2775 	if (ret != KERN_SUCCESS) {
2776 		return (tt_entry_t *)0;
2777 	}
2778 
2779 	/**
2780 	 * Drain the epochs to ensure any lingering batched operations that may have taken
2781 	 * an in-flight reference to this page are complete.
2782 	 */
2783 	pmap_retype_epoch_prepare_drain();
2784 
2785 	assert(pa);
2786 
2787 	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
2788 	 * Depending on the device, this can vary between 512b and 16K. */
2789 	OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
2790 	pmap_tt_ledger_credit(pmap, PAGE_SIZE);
2791 
2792 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2793 	retype_params.attr_idx = pt_attr->geometry_id;
2794 	retype_params.flags = sptm_root_flags;
2795 	if (is_stage2_pmap) {
2796 		retype_params.vmid = pmap->vmid;
2797 	} else {
2798 		retype_params.asid = pmap->asid;
2799 	}
2800 
2801 	pmap_retype_epoch_drain();
2802 
2803 	sptm_retype(pa, XNU_DEFAULT, is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE : XNU_USER_ROOT_TABLE,
2804 	    retype_params);
2805 
2806 	return (tt_entry_t *) phystokv(pa);
2807 }
2808 
2809 static void
2810 pmap_tt1_deallocate(
2811 	pmap_t pmap,
2812 	tt_entry_t *tt)
2813 {
2814 	pmap_paddr_t pa = kvtophys_nofail((vm_offset_t)tt);
2815 	const bool is_stage2_pmap = false;
2816 	const sptm_frame_type_t page_type = is_stage2_pmap ? XNU_STAGE2_ROOT_TABLE :
2817 	    pmap->type == PMAP_TYPE_NESTED ? XNU_SHARED_ROOT_TABLE : XNU_USER_ROOT_TABLE;
2818 
2819 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2820 	sptm_retype(pa, page_type, XNU_DEFAULT, retype_params);
2821 	pmap_page_free(pa);
2822 
2823 	OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
2824 	pmap_tt_ledger_debit(pmap, PAGE_SIZE);
2825 }
2826 
2827 MARK_AS_PMAP_TEXT static kern_return_t
2828 pmap_tt_allocate(
2829 	pmap_t pmap,
2830 	tt_entry_t **ttp,
2831 	unsigned int level,
2832 	unsigned int options)
2833 {
2834 	pmap_paddr_t pa;
2835 	*ttp = NULL;
2836 
2837 	if (*ttp == NULL) {
2838 		const unsigned int alloc_flags =
2839 		    (options & PMAP_TT_ALLOCATE_NOWAIT) ? PMAP_PAGE_ALLOCATE_NOWAIT : 0;
2840 
2841 		/* Allocate a VM page to be used as the page table. */
2842 		if (pmap_page_alloc(&pa, alloc_flags) != KERN_SUCCESS) {
2843 			return KERN_RESOURCE_SHORTAGE;
2844 		}
2845 
2846 		pt_desc_t *ptdp = ptd_alloc(pmap, alloc_flags);
2847 		if (ptdp == NULL) {
2848 			pmap_page_free(pa);
2849 			return KERN_RESOURCE_SHORTAGE;
2850 		}
2851 
2852 		unsigned int pai = pa_index(pa);
2853 		locked_pvh_t locked_pvh = pvh_lock(pai);
2854 		assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL), "%s: non-empty PVH %p",
2855 		    __func__, (void*)locked_pvh.pvh);
2856 
2857 		/**
2858 		 * Drain the epochs to ensure any lingering batched operations that may have taken
2859 		 * an in-flight reference to this page are complete.
2860 		 */
2861 		pmap_retype_epoch_prepare_drain();
2862 
2863 		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
2864 			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
2865 		} else {
2866 			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
2867 		}
2868 
2869 		pmap_tt_ledger_credit(pmap, PAGE_SIZE);
2870 
2871 		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
2872 
2873 		pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
2874 		pvh_unlock(&locked_pvh);
2875 
2876 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2877 		retype_params.level = (sptm_pt_level_t)level;
2878 
2879 		/**
2880 		 * SPTM TODO: To reduce the cost of draining and retyping, consider caching freed page table pages
2881 		 * in a small per-CPU bucket and reusing them in preference to calling pmap_page_alloc() above.
2882 		 */
2883 		pmap_retype_epoch_drain();
2884 
2885 		sptm_retype(pa, XNU_DEFAULT, get_sptm_pt_type(pmap), retype_params);
2886 
2887 		*ttp = (tt_entry_t *)phystokv(pa);
2888 	}
2889 
2890 	assert(*ttp);
2891 
2892 	return KERN_SUCCESS;
2893 }
2894 
2895 static void
2896 pmap_tt_deallocate(
2897 	pmap_t pmap,
2898 	tt_entry_t *ttp,
2899 	unsigned int level)
2900 {
2901 	pt_desc_t *ptdp;
2902 	vm_offset_t     free_page = 0;
2903 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2904 
2905 	ptdp = ptep_get_ptd(ttp);
2906 	ptdp->va = (vm_offset_t)-1;
2907 
2908 	const uint16_t refcnt = sptm_get_page_table_refcnt(kvtophys_nofail((vm_offset_t)ttp));
2909 
2910 	if (__improbable(refcnt != 0)) {
2911 		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, refcnt);
2912 	}
2913 
2914 	free_page = (vm_offset_t)ttp & ~PAGE_MASK;
2915 	if (free_page != 0) {
2916 		pmap_paddr_t pa = kvtophys_nofail(free_page);
2917 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
2918 		sptm_retype(pa, get_sptm_pt_type(pmap), XNU_DEFAULT, retype_params);
2919 		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
2920 
2921 		unsigned int pai = pa_index(pa);
2922 		locked_pvh_t locked_pvh = pvh_lock(pai);
2923 		assertf(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTDP), "%s: non-PTD PVH %p",
2924 		    __func__, (void*)locked_pvh.pvh);
2925 		pvh_update_head(&locked_pvh, NULL, PVH_TYPE_NULL);
2926 		pvh_unlock(&locked_pvh);
2927 		pmap_page_free(pa);
2928 		if (level < pt_attr_leaf_level(pt_attr)) {
2929 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
2930 		} else {
2931 			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
2932 		}
2933 		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
2934 		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
2935 	}
2936 }
2937 
2938 /**
2939  * Check a page table's refcounts after clearing the translation table entry that points to it.
2940  *
2941  * @note If the cleared TTE points to a leaf table, then that leaf table
2942  *       must have a refcnt of zero before the TTE can be removed.
2943  *
2944  * @param pmap The pmap containing the page table whose TTE is being removed.
2945  * @param tte Value stored in the TTE prior to clearing it
2946  * @param level The level of the page table that contains the TTE being removed
2947  */
2948 static void
2949 pmap_tte_check_refcounts(
2950 	pmap_t pmap,
2951 	tt_entry_t tte,
2952 	unsigned int level)
2953 {
2954 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2955 
2956 	/**
2957 	 * Remember, the passed in "level" parameter refers to the level above the
2958 	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
2959 	 * page table).
2960 	 */
2961 	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));
2962 
2963 	unsigned short refcnt = 0;
2964 
2965 	/**
2966 	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
2967 	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
2968 	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
2969 	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
2970 	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
2971 	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
2972 	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
2973 	 * synchronize it against the disconnect operation.  If that removal caused the
2974 	 * refcount to reach zero, the pagetable page could be freed before the disconnect
2975 	 * operation is finished using the relevant pagetable descriptor.
2976 	 * Address these cases by waiting until all CPUs have been observed to not be
2977 	 * executing pmap_disconnect().
2978 	 */
2979 	if (remove_leaf_table) {
2980 		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
2981 		const int max_cpu = ml_get_max_cpu_number();
2982 		bitmap_full(&active_disconnects[0], max_cpu + 1);
2983 		bool inflight_disconnect;
2984 
2985 		/*
2986 		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
2987 		 * ahead of any prior PTE load which may have observed the effect of a
2988 		 * concurrent disconnect operation.  An acquire fence is required for this;
2989 		 * a load-acquire operation is insufficient.
2990 		 */
2991 		os_atomic_thread_fence(acquire);
2992 		do {
2993 			inflight_disconnect = false;
2994 			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
2995 			    i >= 0;
2996 			    i = bitmap_next(&active_disconnects[0], i)) {
2997 				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
2998 				if (cpu_data == NULL) {
2999 					continue;
3000 				}
3001 				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
3002 					__builtin_arm_wfe();
3003 					inflight_disconnect = true;
3004 					continue;
3005 				}
3006 				os_atomic_clear_exclusive();
3007 				bitmap_clear(&active_disconnects[0], (unsigned int)i);
3008 			}
3009 		} while (inflight_disconnect);
3010 		/* Ensure the refcount is observed after any observation of inflight_disconnect */
3011 		os_atomic_thread_fence(acquire);
3012 		refcnt = sptm_get_page_table_refcnt(tte_to_pa(tte));
3013 	}
3014 
3015 #if MACH_ASSERT
3016 	/**
3017 	 * On internal devices, always do the page table consistency check
3018 	 * regardless of page table level or the actual refcnt value.
3019 	 */
3020 	{
3021 #else /* MACH_ASSERT */
3022 	/**
3023 	 * Only perform the page table consistency check when deleting leaf page
3024 	 * tables and it seems like there might be valid/compressed mappings
3025 	 * leftover.
3026 	 */
3027 	if (__improbable(remove_leaf_table && refcnt != 0)) {
3028 #endif /* MACH_ASSERT */
3029 
3030 		/**
3031 		 * There are multiple problems that can arise as a non-zero refcnt:
3032 		 * 1. A bug in the refcnt management logic.
3033 		 * 2. A memory stomper or hardware failure.
3034 		 * 3. The VM forgetting to unmap all of the valid mappings in an address
3035 		 *    space before destroying a pmap.
3036 		 *
3037 		 * By looping over the page table and determining how many valid or
3038 		 * compressed entries there actually are, we can narrow down which of
3039 		 * these three cases is causing this panic. If the expected refcnt
3040 		 * (valid + compressed) and the actual refcnt don't match then the
3041 		 * problem is probably either a memory corruption issue (if the
3042 		 * non-empty entries don't match valid+compressed, that could also be a
3043 		 * sign of corruption) or refcnt management bug. Otherwise, there
3044 		 * actually are leftover mappings and the higher layers of xnu are
3045 		 * probably at fault.
3046 		 *
3047 		 * Note that we use PAGE_SIZE to govern the range of the table check,
3048 		 * because even for 4K processes we still allocate a 16K page for each
3049 		 * page table; we simply map it using 4 adjacent TTEs for the 4K case.
3050 		 */
3051 		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(PAGE_SIZE - 1)));
3052 
3053 		pt_entry_t *ptep = bpte;
3054 		unsigned short wiredcnt = ptep_get_info((pt_entry_t*)ttetokv(tte))->wiredcnt;
3055 		unsigned short non_empty = 0, valid = 0, comp = 0;
3056 		for (unsigned int i = 0; i < (PAGE_SIZE / sizeof(*ptep)); i++, ptep++) {
3057 			/* Keep track of all non-empty entries to detect memory corruption. */
3058 			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
3059 				non_empty++;
3060 			}
3061 
3062 			if (__improbable(pte_is_compressed(*ptep, ptep))) {
3063 				comp++;
3064 			} else if (__improbable(pte_is_valid(*ptep))) {
3065 				valid++;
3066 			}
3067 		}
3068 
3069 #if MACH_ASSERT
3070 		/**
3071 		 * On internal machines, panic whenever a page table getting deleted has
3072 		 * leftover mappings (valid or otherwise) or a leaf page table has a
3073 		 * non-zero refcnt.
3074 		 */
3075 		if (__improbable((non_empty != 0) || (remove_leaf_table && ((refcnt != 0) || (wiredcnt != 0))))) {
3076 #else /* MACH_ASSERT */
3077 		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
3078 		{
3079 #endif /* MACH_ASSERT */
3080 			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
3081 			    "%d compressed, %d non-empty, refcnt=%d, wiredcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
3082 			    level + 1, valid, comp, non_empty, refcnt, wiredcnt, level, (uint64_t)tte, pmap, bpte);
3083 		}
3084 	}
3085 }
3086 
3087 /**
3088  * Remove translation table entry pointing to a nested shared region table
3089  *
3090  * @note The TTE to clear out is expected to point to a leaf table with a refcnt
3091  *       of zero.
3092  *
3093  * @param pmap The user pmap containing the nested page table whose TTE is being removed.
3094  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3095  * @param ttep Pointer to the TTE that should be cleared out.
3096  */
3097 static void
3098 pmap_tte_trim(
3099 	pmap_t pmap,
3100 	vm_offset_t va_start,
3101 	tt_entry_t *ttep)
3102 {
3103 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3104 	assert(ttep != NULL);
3105 	const tt_entry_t tte = *ttep;
3106 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3107 
3108 	if (__improbable(tte == ARM_TTE_EMPTY)) {
3109 		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3110 		    "stomper? pmap=%p ttep=%p", __func__, pt_attr_twig_level(pt_attr), pmap, ttep);
3111 	}
3112 
3113 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
3114 	sptm_unnest_region(pmap->ttep, pmap->nested_pmap->ttep, va_start, (pt_attr_twig_size(pt_attr) * page_ratio) >> pt_attr->pta_page_shift);
3115 
3116 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3117 
3118 	pmap_tte_check_refcounts(pmap, tte, pt_attr_twig_level(pt_attr));
3119 }
3120 
3121 /**
3122  * Remove a translation table entry.
3123  *
3124  * @note If the TTE to clear out points to a leaf table, then that leaf table
3125  *       must have a mapping refcount of zero before the TTE can be removed.
3126  * @note This function expects to be called with pmap locked exclusive, and will
3127  *       return with pmap unlocked.
3128  *
3129  * @param pmap The pmap containing the page table whose TTE is being removed.
3130  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3131  * @param ttep Pointer to the TTE that should be cleared out.
3132  * @param level The level of the page table that contains the TTE to be removed.
3133  */
3134 static void
3135 pmap_tte_remove(
3136 	pmap_t pmap,
3137 	vm_offset_t va_start,
3138 	tt_entry_t *ttep,
3139 	unsigned int level)
3140 {
3141 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3142 	assert(ttep != NULL);
3143 	const tt_entry_t tte = *ttep;
3144 
3145 	if (__improbable(tte == ARM_TTE_EMPTY)) {
3146 		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
3147 		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
3148 	}
3149 
3150 	sptm_unmap_table(pmap->ttep, pt_attr_align_va(pmap_get_pt_attr(pmap), level, va_start), (sptm_pt_level_t)level);
3151 
3152 	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3153 
3154 	pmap_tte_check_refcounts(pmap, tte, level);
3155 }
3156 
3157 /**
3158  * Given a pointer to an entry within a `level` page table, delete the
3159  * page table at `level` + 1 that is represented by that entry. For instance,
3160  * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3161  * contains the PA of the L3 table, and `level` would be "2".
3162  *
3163  * @note If the table getting deallocated is a leaf table, then that leaf table
3164  *       must have a mapping refcount of zero before getting deallocated.
3165  * @note This function expects to be called with pmap locked exclusive and will
3166  *       return with pmap unlocked.
3167  *
3168  * @param pmap The pmap that owns the page table to be deallocated.
3169  * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3170  * @param ttep Pointer to the `level` TTE to remove.
3171  * @param level The level of the table that contains an entry pointing to the
3172  *              table to be removed. The deallocated page table will be a
3173  *              `level` + 1 table (so if `level` is 2, then an L3 table will be
3174  *              deleted).
3175  */
3176 void
3177 pmap_tte_deallocate(
3178 	pmap_t pmap,
3179 	vm_offset_t va_start,
3180 	tt_entry_t *ttep,
3181 	unsigned int level)
3182 {
3183 	tt_entry_t tte;
3184 
3185 	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3186 
3187 	tte = *ttep;
3188 
3189 	if (tte_get_ptd(tte)->pmap != pmap) {
3190 		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3191 		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3192 	}
3193 
3194 	assertf(tte_is_table(tte), "%s: invalid TTE %p (0x%llx)", __func__, ttep,
3195 	    (unsigned long long)tte);
3196 
3197 	/* pmap_tte_remove() will drop the pmap lock */
3198 	pmap_tte_remove(pmap, va_start, ttep, level);
3199 
3200 	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3201 }
3202 
3203 /*
3204  *	Remove a range of hardware page-table entries.
3205  *	The range is given as the first (inclusive)
3206  *	and last (exclusive) virtual addresses mapped by
3207  *      the PTE region to be removed.
3208  *
3209  *	The pmap must be locked shared.
3210  *	If the pmap is not the kernel pmap, the range must lie
3211  *	entirely within one pte-page. Assumes that the pte-page exists.
3212  *
3213  *	Accounting for the removed mappings is applied directly to the pmap's ledgers.
3214  */
3215 MARK_AS_PMAP_TEXT static void
3216 pmap_remove_range(
3217 	pmap_t pmap,
3218 	vm_map_address_t va,
3219 	vm_map_address_t end)
3220 {
3221 	pmap_remove_range_options(pmap, va, end, PMAP_OPTIONS_REMOVE);
3222 }
3223 
3224 MARK_AS_PMAP_TEXT void
3225 pmap_remove_range_options(
3226 	pmap_t pmap,
3227 	vm_map_address_t start,
3228 	vm_map_address_t end,
3229 	int options)
3230 {
3231 	const unsigned int sptm_flags = ((options & PMAP_OPTIONS_REMOVE) ? SPTM_REMOVE_COMPRESSED : 0);
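	/*
	 * Per-category mapping counts accumulated across the removal loop below and
	 * applied to the pmap's ledgers once the loop completes.
	 */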
3232 	unsigned int num_removed = 0;
3233 	unsigned int num_external = 0, num_internal = 0, num_reusable = 0;
3234 	unsigned int num_alt_internal = 0;
3235 	unsigned int num_compressed = 0, num_alt_compressed = 0;
3236 	unsigned short num_unwired = 0;
3237 	bool need_strong_sync = false;
3238 
3239 	/*
3240 	 * The pmap lock must be held here.  In most (if not all) cases it will be held shared.
3241 	 */
3242 	pmap_assert_locked(pmap, PMAP_LOCK_HELD);
3243 
3244 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3245 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
3246 	const uint64_t pmap_page_shift = pt_attr_leaf_shift(pt_attr);
3247 	vm_map_address_t va = start;
3248 	pt_entry_t *cpte = pmap_pte(pmap, va);
3249 	assert(cpte != NULL);
3250 
3251 	while (va < end) {
3252 		/**
3253 		 * We may need to sleep when taking the PVH lock below, and our pmap_pv_remove()
3254 		 * call below may also place the lock in sleep mode if processing a large PV list.
3255 		 * We therefore can't leave preemption disabled across that code, which means we
3256 		 * can't directly use the per-CPU prev_ptes array in that code.  Since that code
3257 		 * only cares about the physical address stored in each prev_ptes entry, we'll
3258 		 * use a local array to stash off only the 4-byte physical address index in order
3259 		 * to reduce stack usage.
3260 		 */
3261 		unsigned int pai_list[SPTM_MAPPING_LIMIT];
3262 		_Static_assert(SPTM_MAPPING_LIMIT <= 64,
3263 		    "SPTM_MAPPING_LIMIT value causes excessive stack usage for pai_list");
3264 
3265 		unsigned int num_mappings = (end - va) >> pmap_page_shift;
3266 		if (num_mappings > SPTM_MAPPING_LIMIT) {
3267 			num_mappings = SPTM_MAPPING_LIMIT;
3268 		}
3269 
3270 		/**
3271 		 * Disable preemption to ensure that we can safely access per-CPU mapping data after
3272 		 * issuing the SPTM call.
3273 		 */
3274 		disable_preemption();
3275 		/**
3276 		 * Enter the retype epoch for the batched unmap operation.  This is necessary because we
3277 		 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
3278 		 * call, so a concurrent pmap_page_protect() operation against one of those pages may
3279 		 * race this call.  That should be perfectly fine as far as the PTE updates are concerned,
3280 		 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
3281 		 * if it does not first drain our epoch.
3282 		 */
3283 		pmap_retype_epoch_enter();
3284 		sptm_unmap_region(pmap->ttep, va, num_mappings, sptm_flags);
3285 		pmap_retype_epoch_exit();
3286 
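		/*
		 * The SPTM has recorded the prior contents of each unmapped PTE in the
		 * per-CPU prev_ptes array; walk it (with preemption still disabled) and
		 * stash the physical page indices needed for the PV-list pass below.
		 */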
3287 		sptm_pte_t *prev_ptes = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes;
3288 		for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3289 			const pt_entry_t prev_pte = prev_ptes[i];
3290 
3291 			if (pte_is_compressed(prev_pte, cpte)) {
3292 				if (options & PMAP_OPTIONS_REMOVE) {
3293 					++num_compressed;
3294 					if (prev_pte & ARM_PTE_COMPRESSED_ALT) {
3295 						++num_alt_compressed;
3296 					}
3297 				}
3298 				pai_list[i] = INVALID_PAI;
3299 				continue;
3300 			} else if (!pte_is_valid(prev_pte)) {
3301 				pai_list[i] = INVALID_PAI;
3302 				continue;
3303 			}
3304 
3305 			if (pte_is_wired(prev_pte)) {
3306 				num_unwired++;
3307 			}
3308 
3309 			const pmap_paddr_t pa = pte_to_pa(prev_pte);
3310 
3311 			if (__improbable(!pa_valid(pa))) {
3312 				pai_list[i] = INVALID_PAI;
3313 				continue;
3314 			}
3315 			pai_list[i] = pa_index(pa);
3316 		}
3317 
3318 		enable_preemption();
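		/* Rewind cpte to the first PTE of this batch for the PV-list removal pass below. */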
3319 		cpte -= num_mappings;
3320 
3321 		for (unsigned int i = 0; i < num_mappings; ++i, ++cpte) {
3322 			if (pai_list[i] == INVALID_PAI) {
3323 				continue;
3324 			}
3325 			locked_pvh_t locked_pvh;
3326 			if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
3327 				locked_pvh = pvh_lock_nopreempt(pai_list[i]);
3328 			} else {
3329 				locked_pvh = pvh_lock(pai_list[i]);
3330 			}
3331 
3332 			bool is_internal, is_altacct;
3333 			pv_remove_return_t remove_status = pmap_remove_pv(pmap, cpte, &locked_pvh, &is_internal, &is_altacct);
3334 
3335 			switch (remove_status) {
3336 			case PV_REMOVE_SUCCESS:
3337 				++num_removed;
3338 				if (is_altacct) {
3339 					assert(is_internal);
3340 					num_internal++;
3341 					num_alt_internal++;
3342 				} else if (is_internal) {
3343 					if (ppattr_test_reusable(pai_list[i])) {
3344 						num_reusable++;
3345 					} else {
3346 						num_internal++;
3347 					}
3348 				} else {
3349 					num_external++;
3350 				}
3351 				break;
3352 			default:
3353 				/*
3354 				 * PVE already removed; this can happen due to a concurrent pmap_disconnect()
3355 				 * executing before we grabbed the PVH lock.
3356 				 */
3357 				break;
3358 			}
3359 
3360 			pvh_unlock(&locked_pvh);
3361 		}
3362 
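		/* Advance past the batch just processed and continue with the next chunk. */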
3363 		va += (num_mappings << pmap_page_shift);
3364 	}
3365 
3366 	if (__improbable(need_strong_sync)) {
3367 		arm64_sync_tlb(true);
3368 	}
3369 
3370 	/*
3371 	 *	Update the counts
3372 	 */
3373 	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
3374 
3375 	if (pmap != kernel_pmap) {
3376 		if (num_unwired != 0) {
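			/*
			 * For user pmaps the removed range lies within a single leaf page table,
			 * so any PTE in the range (here, the last one visited) identifies the PTD
			 * whose wired count needs to be adjusted.
			 */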
3377 			ptd_info_t * const ptd_info = ptep_get_info(cpte - 1);
3378 			if (__improbable(os_atomic_sub_orig(&ptd_info->wiredcnt, num_unwired, relaxed) < num_unwired)) {
3379 				panic("%s: pmap %p VA [0x%llx, 0x%llx) (ptd info %p) wired count underflow", __func__, pmap,
3380 				    (unsigned long long)start, (unsigned long long)end, ptd_info);
3381 			}
3382 		}
3383 
3384 		/* update ledgers */
3385 		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
3386 		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
3387 		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
3388 		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
3389 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
3390 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
3391 		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
3392 		/* make needed adjustments to phys_footprint */
3393 		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
3394 		    ((num_internal -
3395 		    num_alt_internal) +
3396 		    (num_compressed -
3397 		    num_alt_compressed)) * pmap_page_size);
3398 	}
3399 }
3400 
3401 
3402 /*
3403  *	Remove the given range of addresses
3404  *	from the specified map.
3405  *
3406  *	It is assumed that the start and end are properly
3407  *	rounded to the hardware page size.
3408  */
3409 void
3410 pmap_remove(
3411 	pmap_t pmap,
3412 	vm_map_address_t start,
3413 	vm_map_address_t end)
3414 {
3415 	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
3416 }
3417 
3418 MARK_AS_PMAP_TEXT vm_map_address_t
3419 pmap_remove_options_internal(
3420 	pmap_t pmap,
3421 	vm_map_address_t start,
3422 	vm_map_address_t end,
3423 	int options)
3424 {
3425 	vm_map_address_t eva = end;
3426 	tt_entry_t     *tte_p;
3427 	bool            unlock = true;
3428 
3429 	if (__improbable(end < start)) {
3430 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
3431 	}
3432 	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3433 		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
3434 	}
3435 
3436 	validate_pmap_mutable(pmap);
3437 
3438 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3439 
3440 	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
3441 	pmap_lock(pmap, lock_mode);
3442 
3443 	tte_p = pmap_tte(pmap, start);
3444 
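	/* Nothing to remove if no leaf table is mapped at the start of this chunk. */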
3445 	if ((tte_p == NULL) || ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_FAULT)) {
3446 		goto done;
3447 	}
3448 
3449 	assertf(tte_is_table(*tte_p), "%s: invalid TTE %p (0x%llx) for pmap %p va 0x%llx",
3450 	    __func__, tte_p, (unsigned long long)*tte_p, pmap, (unsigned long long)start);
3451 
3452 	pmap_remove_range_options(pmap, start, end, options);
3453 
3454 	if (pmap->type != PMAP_TYPE_USER) {
3455 		goto done;
3456 	}
3457 
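	/*
	 * If the leaf table no longer holds any mappings, try to reclaim it.  Doing so
	 * requires the exclusive pmap lock, so take a wired reference on the table first
	 * to keep it from being freed while the shared lock may be dropped and retaken.
	 */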
3458 	uint16_t refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3459 	if (__improbable(refcnt == 0)) {
3460 		ptd_info_t *ptd_info = ptep_get_info((pt_entry_t*)ttetokv(*tte_p));
3461 		os_atomic_inc(&ptd_info->wiredcnt, relaxed); // Prevent someone else from freeing the table if we need to drop the lock
3462 		if (!pmap_lock_shared_to_exclusive(pmap)) {
3463 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3464 		}
3465 		lock_mode = PMAP_LOCK_EXCLUSIVE;
3466 		refcnt = sptm_get_page_table_refcnt(tte_to_pa(*tte_p));
3467 		if ((os_atomic_dec(&ptd_info->wiredcnt, relaxed) == 0) && (refcnt == 0)) {
3468 			/**
3469 			 * Drain any concurrent retype-sensitive SPTM operations.  This is needed to
3470 			 * ensure that we don't unmap the page table and retype it while those operations
3471 			 * are still finishing on other CPUs, leading to an SPTM violation.  In particular,
3472 			 * the multipage batched cacheability/attribute update code may issue SPTM calls
3473 			 * without holding the relevant PVH or pmap locks, so we can't guarantee those
3474 			 * calls have actually completed despite observing refcnt == 0.
3475 			 *
3476 			 * At this point, we CAN guarantee that:
3477 			 * 1) All prior PTE removals required to produce refcnt == 0 have
3478 			 *    completed and been synchronized for all observers by DSB, and the
3479 			 *    relevant PV list entries removed.  Subsequent calls not already in the
3480 			 *    retype epoch will no longer observe these mappings.
3481 			 * 2) We now hold the pmap lock exclusive, so there will be no further attempt
3482 			 *    to enter mappings in this page table before it is unmapped.
3483 			 */
3484 			pmap_retype_epoch_prepare_drain();
3485 			pmap_retype_epoch_drain();
3486 			pmap_tte_deallocate(pmap, start, tte_p, pt_attr_twig_level(pt_attr));
3487 			unlock = false; // pmap_tte_deallocate() has dropped the lock
3488 		}
3489 	}
3490 done:
3491 	if (unlock) {
3492 		pmap_unlock(pmap, lock_mode);
3493 	}
3494 
3495 	return eva;
3496 }
3497 
3498 void
3499 pmap_remove_options(
3500 	pmap_t pmap,
3501 	vm_map_address_t start,
3502 	vm_map_address_t end,
3503 	int options)
3504 {
3505 	vm_map_address_t va;
3506 
3507 	if (pmap == PMAP_NULL) {
3508 		return;
3509 	}
3510 
3511 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3512 
3513 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
3514 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
3515 	    VM_KERNEL_ADDRHIDE(end));
3516 
3517 #if MACH_ASSERT
3518 	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
3519 		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
3520 		    pmap, (uint64_t)start, (uint64_t)end);
3521 	}
3522 	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
3523 		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
3524 		    pmap, (uint64_t)start, (uint64_t)end);
3525 	}
3526 #endif
3527 
3528 	/*
3529 	 * We allow single-page requests to execute non-preemptibly,
3530 	 * as it doesn't make sense to sample AST_URGENT for a single-page
3531 	 * operation, and there are a couple of special use cases that
3532 	 * require a non-preemptible single-page operation.
3533 	 */
3534 	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
3535 		pmap_verify_preemptible();
3536 	}
3537 
3538 	/*
3539 	 *      Remove the range one twig-sized (leaf-table-aligned) chunk at a time.
3540 	 */
3541 	va = start;
3542 	while (va < end) {
3543 		vm_map_address_t l;
3544 
3545 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
3546 		if (l > end) {
3547 			l = end;
3548 		}
3549 
3550 		va = pmap_remove_options_internal(pmap, va, l, options);
3551 	}
3552 
3553 	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
3554 }
3555 
3556 
3557 /*
3558  *	Remove phys addr if mapped in specified map
3559  */
3560 void
3561 pmap_remove_some_phys(
3562 	__unused pmap_t map,
3563 	__unused ppnum_t pn)
3564 {
3565 	/* Implement to support working set code */
3566 }
3567 
3568 /*
3569  * Implementation of PMAP_SWITCH_USER that Mach VM uses to
3570  * switch a thread onto a new vm_map.
3571  */
3572 void
3573 pmap_switch_user(thread_t thread, vm_map_t new_map)
3574 {
3575 	pmap_t new_pmap = new_map->pmap;
3576 
3577 
3578 	thread->map = new_map;
3579 	pmap_set_pmap(new_pmap, thread);
3580 
3581 }
3582 void
3583 pmap_set_pmap(
3584 	pmap_t pmap,
3585 	thread_t thread)
3586 {
3587 	pmap_switch(pmap, thread);
3588 }
3589 
3590 MARK_AS_PMAP_TEXT void
3591 pmap_switch_internal(
3592 	pmap_t pmap,
3593 	thread_t thread)
3594 {
3595 	validate_pmap_mutable(pmap);
3596 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3597 	uint16_t asid_index = PMAP_HWASID(pmap); /* not const: adjusted below when __ARM_KERNEL_PROTECT__ is set */
3598 	if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
3599 		panic("%s: attempt to activate pmap %p with invalid ASID", __func__, pmap);
3600 	}
3601 
3602 #if __ARM_KERNEL_PROTECT__
3603 	asid_index >>= 1;
3604 #endif
3605 
3606 	if (asid_index > 0) {
3607 		pmap_update_plru(asid_index);
3608 	}
3609 
3610 	__unused sptm_return_t sptm_return;
3611 #pragma unused(thread)
3612 	sptm_return = sptm_switch_root(pmap->ttep, 0, 0);
3616 
3617 #if DEVELOPMENT || DEBUG
3618 	if (__improbable(sptm_return & SPTM_SWITCH_ASID_TLBI_FLUSH)) {
3619 		os_atomic_inc(&pmap_asid_flushes, relaxed);
3620 	}
3621 
3622 	if (__improbable(sptm_return & SPTM_SWITCH_RCTX_FLUSH)) {
3623 		os_atomic_inc(&pmap_speculation_restrictions, relaxed);
3624 	}
3625 #endif /* DEVELOPMENT || DEBUG */
3626 }
3627 
3628 void
3629 pmap_switch(
3630 	pmap_t pmap,
3631 	thread_t thread)
3632 {
3633 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), PMAP_HWASID(pmap));
3634 	pmap_switch_internal(pmap, thread);
3635 	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
3636 }
3637 
3638 void
3639 pmap_page_protect(
3640 	ppnum_t ppnum,
3641 	vm_prot_t prot)
3642 {
3643 	pmap_page_protect_options(ppnum, prot, 0, NULL);
3644 }
3645 
3646 /**
3647  * Helper function for performing per-mapping accounting following an SPTM disjoint unmap request.
3648  *
3649  * @note [pmap] cannot be the kernel pmap. This is because we do not maintain a ledger in the
3650  *       kernel pmap.
3651  *
3652  * @param pmap The pmap that contained the mapping
3653  * @param pai The physical page index mapped by the mapping
3654  * @param is_compressed Indicates whether the operation was an unmap-to-compress vs. a full unmap
3655  * @param is_internal Indicates whether the mapping was for an internal (aka anonymous) VM page
3656  * @param is_altacct Indicates whether the mapping was subject to alternate accounting.
3657  */
3658 static void
3659 pmap_disjoint_unmap_accounting(pmap_t pmap, unsigned int pai, bool is_compressed, bool is_internal, bool is_altacct)
3660 {
3661 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3662 	pvh_assert_locked(pai);
3663 
3664 	assert(pmap != kernel_pmap);
3665 
3666 	if (is_internal &&
3667 	    !is_altacct &&
3668 	    ppattr_test_reusable(pai)) {
3669 		pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3670 	} else if (!is_internal) {
3671 		pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3672 	}
3673 
3674 	if (is_altacct) {
3675 		assert(is_internal);
3676 		pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3677 		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3678 		if (is_compressed) {
3679 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3680 			pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3681 		}
3682 	} else if (ppattr_test_reusable(pai)) {
3683 		assert(is_internal);
3684 		if (is_compressed) {
3685 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3686 			/* was not in footprint, but is now */
3687 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3688 		}
3689 	} else if (is_internal) {
3690 		pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3691 
3692 		/*
3693 		 * Update all stats related to physical footprint, which only
3694 		 * deals with internal pages.
3695 		 */
3696 		if (is_compressed) {
3697 			/*
3698 			 * This removal is only being done so we can send this page to
3699 			 * the compressor; therefore it mustn't affect total task footprint.
3700 			 */
3701 			pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3702 		} else {
3703 			/*
3704 			 * This internal page isn't going to the compressor, so adjust stats to keep
3705 			 * phys_footprint up to date.
3706 			 */
3707 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3708 		}
3709 	} else {
3710 		/* external page: no impact on ledgers */
3711 	}
3712 }
3713 
3714 /**
3715  * Helper function for issuing a disjoint unmap request to the SPTM and performing
3716  * related accounting.  This function uses the 'prev_ptes' list generated by
3717  * the sptm_unmap_disjoint() call to determine whether said call altered the
3718  * relevant PTEs in a manner that would require accounting updates.
3719  *
3720  * @param pa The physical address against which the disjoint unmap will be issued.
3721  * @param num_mappings The number of disjoint mappings for the SPTM to update.
3722  *                     The per-CPU sptm_ops array should contain the same number
3723  *                     of individual disjoint requests.
3724  */
3725 static void
3726 pmap_disjoint_unmap(pmap_paddr_t pa, unsigned int num_mappings)
3727 {
3728 	const unsigned int pai = pa_index(pa);
3729 
3730 	pvh_assert_locked(pai);
3731 
3732 	assert(num_mappings <= SPTM_MAPPING_LIMIT);
3733 
3734 	assert(get_preemption_level() > 0);
3735 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
3736 
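	/*
	 * Issue the batched unmap.  The SPTM records the prior contents of each PTE in
	 * the per-CPU sptm_prev_ptes array, which drives the accounting updates below.
	 */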
3737 	sptm_unmap_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings);
3738 
3739 	for (unsigned int cur_mapping = 0; cur_mapping < num_mappings; ++cur_mapping) {
3740 		pt_entry_t prev_pte = sptm_pcpu->sptm_prev_ptes[cur_mapping];
3741 
3742 		pt_desc_t * const ptdp = sptm_pcpu->sptm_ptds[cur_mapping];
3743 		const pmap_t pmap = ptdp->pmap;
3744 
3745 		assertf(!pte_is_valid(prev_pte) ||
3746 		    ((pte_to_pa(prev_pte) & ~PAGE_MASK) == pa), "%s: prev_pte 0x%llx does not map pa 0x%llx",
3747 		    __func__, (unsigned long long)prev_pte, (unsigned long long)pa);
3748 
3749 		const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
3750 		pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3751 
3752 		if (pmap != kernel_pmap) {
3753 			/*
3754 			 * If the prior PTE is invalid (which may happen due to a concurrent remove operation),
3755 			 * the compressed marker won't be written so we shouldn't account the mapping as compressed.
3756 			 */
3757 			const bool is_compressed = (pte_is_valid(prev_pte) &&
3758 			    ((sptm_pcpu->sptm_ops[cur_mapping].pte_template & ARM_PTE_COMPRESSED_MASK) != 0));
3759 			const bool is_internal = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_INTERNAL) != 0;
3760 			const bool is_altacct = (sptm_pcpu->sptm_acct_flags[cur_mapping] & PMAP_SPTM_FLAG_ALTACCT) != 0;
3761 
3762 			/*
3763 			 * The rule is that accounting related to PTE contents (wired, PTD refcount)
3764 			 * must be updated by whoever clears the PTE, while accounting related to physical page
3765 			 * attributes must be updated by whoever clears the PVE.  We therefore always call
3766 			 * pmap_disjoint_unmap_accounting() here since we're removing the PVE, but only update
3767 			 * wired/PTD accounting if the prior PTE was valid.
3768 			 */
3769 			pmap_disjoint_unmap_accounting(pmap, pai, is_compressed, is_internal, is_altacct);
3770 
3771 			if (!pte_is_valid(prev_pte)) {
3772 				continue;
3773 			}
3774 
3775 			if (pte_is_wired(prev_pte)) {
3776 				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
3777 				if (__improbable(os_atomic_dec_orig(&sptm_pcpu->sptm_ptd_info[cur_mapping]->wiredcnt, relaxed) == 0)) {
3778 					panic("%s: over-unwire of ptdp %p, ptd info %p", __func__,
3779 					    ptdp, sptm_pcpu->sptm_ptd_info[cur_mapping]);
3780 				}
3781 			}
3782 		}
3783 	}
3784 }
3785 
3786 /**
3787  * The following two functions, pmap_multipage_op_submit_disjoint() and
3788  * pmap_multipage_op_add_page(), are intended to allow callers to manage batched SPTM
3789  * operations that may span multiple physical pages.  They are intended to operate in
3790  * a way that allows callers such as pmap_page_protect_options_with_flush_range() to
3791  * insert mappings into the per-CPU SPTM disjoint ops array in the same manner that
3792  * they would for an ordinary single-page operation.
3793  * Functions such as pmap_page_protect_options_with_flush_range() operate on a single
3794  * physical page but may be passed a non-NULL flush_range object to indicate that the
3795  * call is part of a larger batched operation which may span multiple physical pages.
3796  * In that scenario, these functions are intended to be used as follows:
3797  * 1) Call pmap_multipage_op_add_page() to insert a "header" for the page into the per-
3798  *    CPU SPTM ops array.  Use the return value from this call as the starting index
3799  *    at which to add ordinary mapping entries into the same array.
3800  * 2) Insert sptm_disjoint_op_t entries into the ops array in the normal manner until
3801  *    the array is full, the SPTM options required for the upcoming sequence of pages
3802  *    need to change, or the current mapping matches flush_range->current_ptep.
3803  *    In the latter case, pmap_insert_flush_range_template() may instead be used
3804  *    to insert the mapping into the per-CPU SPTM region templates array.  See the
3805  *    documentation for pmap_insert_flush_range_template() below.
3806  * 3) If the array is full, call pmap_multipage_op_submit_disjoint() and return to step 1).
3807  * 4) If the SPTM options need to change, call pmap_multipage_op_add_page() to insert
3808  *    a new header with the updated options and, using the return value as the new
3809  *    insertion point for the ops array, resume step 2).
3810  * 5) Upon completion, if there are any pending not-yet-submitted mappings, do not
3811  *    submit those mappings to the SPTM as would ordinarily be done for a single-page
3812  *    call.  These trailing mappings will be submitted as part of the next batch,
3813  *    or by the next-higher caller if the range operation is complete.
3814  *
3815  * Note that, as a performance optimization, the caller may track the insertion
3816  * point in the disjoint ops array locally (i.e. without incrementing
3817  * flush_range->pending_disjoint_entries on every iteration), as long as it takes care to do the
3818  * following:
3819  * 1) Initialize and update that insertion point as described in steps 1) and 4) above.
3820  * 2) Pass the updated insertion point as the 'pending_disjoint_entries' parameter into the calls
3821  *    in steps 3) and 4) above.
3822  * 3) Update flush_range->pending_disjoint_entries with the locally-maintained value along with
3823  *    step 5) above.
3824  */
3825 
3826 /**
3827  * Submit any pending disjoint multi-page mapping updates to the SPTM.
3828  *
3829  * @note This function must be called with preemption disabled, and will drop
3830  *       the preemption-disable count upon submitting to the SPTM.
3831  * @note [pending_disjoint_entries] must include *all* pending entries in the SPTM ops array,
3832  *       including physical address "header" entries.
3833  * @note This function automatically updates the per_paddr_header.num_mappings field
3834  *       for the most recent physical address header in the SPTM ops array to its final
3835  *       value.
3836  *
3837  * @param pending_disjoint_entries The number of not-yet-submitted mappings according to the caller.
3838  *                        This value may be greater than [flush_range]->pending_disjoint_entries if
3839  *                        the caller has inserted mappings into the ops array without
3840  *                        updating [flush_range]->pending_disjoint_entries, in which case this
3841  *                        function will update [flush_range]->pending_disjoint_entries with the
3842  *                        caller's value.
3843  * @param flush_range The object tracking the current state of the multipage disjoint
3844  *                    operation.
3845  */
3846 static inline void
3847 pmap_multipage_op_submit_disjoint(unsigned int pending_disjoint_entries, pmap_tlb_flush_range_t *flush_range)
3848 {
3849 	/**
3850 	 * Reconcile the number of pending entries as tracked by the caller with the
3851 	 * number of pending entries tracked by flush_range.  If the caller's value is
3852 	 * greater, we assume the caller has inserted locally-tracked mappings into the
3853 	 * array without directly updating flush_range->pending_disjoint_entries.  Otherwise, we
3854 	 * assume the caller has no locally-tracked mappings and is simply trying to
3855 	 * purge any pending mappings from a prior call sequence.
3856 	 */
3857 	if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
3858 		flush_range->pending_disjoint_entries = pending_disjoint_entries;
3859 	} else {
3860 		assert(pending_disjoint_entries == 0);
3861 	}
3862 	if (flush_range->pending_disjoint_entries != 0) {
3863 		assert(get_preemption_level() > 0);
3864 		/**
3865 		 * Compute the correct number of mappings for the most recent paddr
3866 		 * header based on the current position in the SPTM ops array.
3867 		 */
3868 		flush_range->current_header->per_paddr_header.num_mappings =
3869 		    flush_range->pending_disjoint_entries - flush_range->current_header_first_mapping_index;
3870 		const sptm_return_t sptm_return = sptm_update_disjoint_multipage(
3871 			PERCPU_GET(pmap_sptm_percpu)->sptm_ops_pa, flush_range->pending_disjoint_entries);
3872 
3873 		/**
3874 		 * We may be submitting the batch and exiting the epoch partway through
3875 		 * processing the PV list for a page.  That's fine, because in that case we'll
3876 		 * hold the PV lock for that page, which will prevent mappings of that page from
3877 		 * being disconnected and will prevent the completion of pmap_remove() against
3878 		 * any of those mappings, thus also guaranteeing the relevant page table pages
3879 		 * can't be freed.  The epoch still protects mappings for any prior page in
3880 		 * the batch, whose PV locks are no longer held.
3881 		 */
3882 		pmap_retype_epoch_exit();
3883 		enable_preemption();
3884 		if (flush_range->pending_region_entries != 0) {
3885 			flush_range->processed_entries += flush_range->pending_disjoint_entries;
3886 		} else {
3887 			flush_range->processed_entries = 0;
3888 		}
3889 		flush_range->pending_disjoint_entries = 0;
3890 		if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
3891 			flush_range->ptfr_flush_needed = true;
3892 		}
3893 	}
3894 }
3895 
3896 /**
3897  * Insert a new physical address "header" entry into the per-CPU SPTM ops array for a
3898  * multi-page SPTM operation.  It is expected that the caller will subsequently add
3899  * mapping entries for this physical address into the array.
3900  *
3901  * @note This function will disable preemption upon creation of the first paddr header
3902  *       (index 0 in the per-CPU SPTM ops array) and it is expected that
3903  *       pmap_multipage_op_submit() will subsequently be called on the same CPU.
3904  * @note Before inserting the new header, this function automatically updates the
3905  *       per_paddr_header.num_mappings field for the previous physical address header
3906  *       (if present) in the SPTM ops array to its final value.
3907  *
3908  * @param phys The physical address for which to insert a header entry.
3909  * @param inout_pending_disjoint_entries
3910  *              [input] The number of not-yet-submitted mappings according to the caller.
3911  *                      This value may be greater than [flush_range]->pending_disjoint_entries if
3912  *                      the caller has inserted mappings into the ops array without
3913  *                      updating [flush_range]->pending_disjoint_entries, in which case this
3914  *                      function will update [flush_range]->pending_disjoint_entries with the
3915  *                      caller's value.
3916  *              [output] Returns the starting index at which the caller should insert mapping
3917  *                       entries into the per-CPU SPTM ops array.
3918  * @param sptm_update_options SPTM_UPDATE_* flags to pass to the SPTM call.
3919  *                            SPTM_UPDATE_SKIP_PAPT is automatically inserted by this
3920  *                            function.
3921  * @param flush_range The object tracking the current state of the multipage operation.
3922  *
3923  * @return True if the region operation was submitted to the SPTM due to the ops array already
3924  * @return True if the pending disjoint operations were submitted to the SPTM due to the ops array already
3925  *         to the array; the caller will need to re-invoke this function after taking any
3926  *         necessary post-submission action (such as enabling preemption).
3927  */
3928 static inline bool
3929 pmap_multipage_op_add_page(
3930 	pmap_paddr_t phys,
3931 	unsigned int *inout_pending_disjoint_entries,
3932 	uint32_t sptm_update_options,
3933 	pmap_tlb_flush_range_t *flush_range)
3934 {
3935 	unsigned int pending_disjoint_entries = *inout_pending_disjoint_entries;
3936 
3937 	/**
3938 	 * Reconcile the number of pending entries as tracked by the caller with the
3939 	 * number of pending entries tracked by flush_range.  If the caller's value is
3940 	 * greater, we assume the caller has inserted locally-tracked mappings into the
3941 	 * array without directly updating flush_range->pending_disjoint_entries.  Otherwise, we
3942 	 * assume the caller has no locally-tracked mappings and is adding its paddr
3943 	 * header for the first time.
3944 	 */
3945 	if (pending_disjoint_entries > flush_range->pending_disjoint_entries) {
3946 		flush_range->pending_disjoint_entries = pending_disjoint_entries;
3947 	} else {
3948 		assert(pending_disjoint_entries == 0);
3949 	}
3950 	if (flush_range->pending_disjoint_entries >= (SPTM_MAPPING_LIMIT - 1)) {
3951 		/**
3952 		 * If the SPTM ops array is either full or only has space for the paddr
3953 		 * header, there won't be room for mapping entries, so submit the pending
3954 		 * mappings to the SPTM now, and return to allow the caller to take
3955 		 * any necessary post-submission action.
3956 		 */
3957 		pmap_multipage_op_submit_disjoint(pending_disjoint_entries, flush_range);
3958 		*inout_pending_disjoint_entries = 0;
3959 		return true;
3960 	}
3961 	pending_disjoint_entries = flush_range->pending_disjoint_entries;
3962 
3963 	sptm_update_options |= SPTM_UPDATE_SKIP_PAPT;
3964 	if (pending_disjoint_entries == 0) {
3965 		disable_preemption();
3966 		/**
3967 		 * Enter the retype epoch while we gather the disjoint update arguments
3968 		 * and issue the SPTM call.  Since this operation may cover multiple physical
3969 		 * pages, we may construct the argument array and invoke the SPTM without holding
3970 		 * all relevant PVH locks or pmap locks.  We therefore need to record that we are
3971 		 * collecting and modifying mapping state so that e.g. pmap_page_protect() does
3972 		 * not attempt to retype the underlying pages and pmap_remove() does not attempt
3973 		 * to free the page tables used for these mappings without first draining our epoch.
3974 		 */
3975 		pmap_retype_epoch_enter();
3976 		flush_range->pending_disjoint_entries = 1;
3977 	} else {
3978 		/**
3979 		 * Before inserting the new header, update the prior header's number
3980 		 * of paddr-specific mappings to its final value.
3981 		 */
3982 		assert(flush_range->current_header != NULL);
3983 		flush_range->current_header->per_paddr_header.num_mappings =
3984 		    pending_disjoint_entries - flush_range->current_header_first_mapping_index;
3985 	}
3986 	sptm_disjoint_op_t *sptm_ops = PERCPU_GET(pmap_sptm_percpu)->sptm_ops;
3987 	flush_range->current_header = (sptm_update_disjoint_multipage_op_t*)&sptm_ops[pending_disjoint_entries];
3988 	flush_range->current_header_first_mapping_index = ++pending_disjoint_entries;
3989 	flush_range->current_header->per_paddr_header.paddr = phys;
3990 	flush_range->current_header->per_paddr_header.num_mappings = 0;
3991 	flush_range->current_header->per_paddr_header.options = sptm_update_options;
3992 
3993 	*inout_pending_disjoint_entries = pending_disjoint_entries;
3994 	return false;
3995 }
3996 
3997 /**
3998  * The following two functions, pmap_multipage_op_submit_region() and
3999  * pmap_insert_flush_range_template(), are meant to be used in a similar fashion
4000  * to pmap_multipage_op_submit_disjoint() and pmap_multipage_op_add_page(),
4001  * but for the specific case in which a given mapping within a PV list happens
4002  * to map the current VA within a VA region being operated on by
4003  * phys_attribute_clear_range().  This allows the pmap to further optimize
4004  * the SPTM calls by using sptm_update_region() to modify all mappings within
4005  * the VA region, which requires far fewer table walks than a disjoint operation.
4006  * Since the starting VA of the region, the owning pmap, and the insertion point
4007  * within the per-CPU region templates array are already known, these functions
4008  * don't require the special "header" entry or the complex array position tracking
4009  * of their disjoint equivalents above.
4010  * Note that these functions may be used together with the disjoint functions above;
4011  * these functions can be used for the "primary" mappings corresponding to the VA
4012  * region being manipulated by the VM layer, while the disjoint functions can be
4013  * used for any alias mappings of the underlying pages which fall outside that
4014  * VA region.
4015  */
4016 
4017 /**
4018  * Submit any pending region-based templates for the specified flush_range.
4019  *
4020  * @note This function must be called with preemption disabled, and will drop
4021  *       the preemption-disable count upon submitting to the SPTM.
4022  *
4023  * @param flush_range The object tracking the current state of the region operation.
4024  */
4025 static inline void
4026 pmap_multipage_op_submit_region(pmap_tlb_flush_range_t *flush_range)
4027 {
4028 	if (flush_range->pending_region_entries != 0) {
4029 		assert(get_preemption_level() > 0);
4030 		pmap_assert_locked(flush_range->ptfr_pmap, PMAP_LOCK_SHARED);
4031 		/**
4032 		 * If there are any pending disjoint entries, we're already in a retype epoch.
4033 		 * For disjoint entries, we need to hold the epoch during the entire time we
4034 		 * construct the disjoint ops array because those ops may point to some arbitrary
4035 		 * pmap and we need to ensure the relevant page tables and even the pmap itself
4036 		 * aren't concurrently reclaimed while our ops array points to them.
4037 		 * But for a region op like this, we know we already hold the relevant pmap lock
4038 		 * so none of the above can happen concurrently.  We therefore only need to hold
4039 		 * the epoch across the SPTM call itself to prevent a concurrent unmap operation
4040 		 * from attempting to retype the mapped pages while our SPTM call has them in-
4041 		 * flight.
4042 		 */
4043 		if (flush_range->pending_disjoint_entries == 0) {
4044 			pmap_retype_epoch_enter();
4045 		}
4046 		const sptm_return_t sptm_return = sptm_update_region(flush_range->ptfr_pmap->ttep,
4047 		    flush_range->pending_region_start, flush_range->pending_region_entries,
4048 		    PERCPU_GET(pmap_sptm_percpu)->sptm_templates_pa,
4049 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | SPTM_UPDATE_DEFER_TLBI);
4050 		if (flush_range->pending_disjoint_entries == 0) {
4051 			pmap_retype_epoch_exit();
4052 		}
4053 		enable_preemption();
4054 		if (flush_range->pending_disjoint_entries != 0) {
4055 			flush_range->processed_entries += flush_range->pending_region_entries;
4056 		} else {
4057 			flush_range->processed_entries = 0;
4058 		}
4059 		flush_range->pending_region_start += (flush_range->pending_region_entries <<
4060 		        pmap_get_pt_attr(flush_range->ptfr_pmap)->pta_page_shift);
4061 		flush_range->pending_region_entries = 0;
4062 		if (sptm_return == SPTM_UPDATE_DELAYED_TLBI) {
4063 			flush_range->ptfr_flush_needed = true;
4064 		}
4065 	}
4066 }
4067 
4068 /**
4069  * Insert a PTE template into the per-CPU SPTM region ops array.
4070  * This is meant to be used as a performance optimization for the case in which a given
4071  * mapping being processed by a function such as pmap_page_protect_options_with_flush_range()
4072  * happens to map the current iteration position within [flush_range]'s VA region.
4073  * In this case the mapping can be inserted as a region-based template rather than a disjoint
4074  * operation as would be done in the general case.  The idea is that region-based SPTM
4075  * operations are significantly less expensive than disjoint operations, because each region
4076  * operation only requires a single page table walk at the beginning vs. a table walk for
4077  * each mapping in the disjoint case.  Since the majority of mappings processed by a flush
4078  * range operation belong to the main flush range VA region (i.e. alias mappings outside
4079  * the region are less common), the performance improvement can be significant.
4080  *
4081  * @note This function will disable preemption upon inserting the first entry into the
4082  *       per-CPU templates array, and will re-enable preemption upon submitting the region
4083  *       operation to the SPTM.
4084  *
4085  * @param template The PTE template to insert into the per-CPU templates array.
4086  * @param flush_range The object tracking the current state of the region operation.
4087  *
4088  * @return True if the region operation was submitted to the SPTM, false otherwise.
4089  */
4090 static inline bool
4091 pmap_insert_flush_range_template(pt_entry_t template, pmap_tlb_flush_range_t *flush_range)
4092 {
4093 	if (flush_range->pending_region_entries == 0) {
4094 		disable_preemption();
4095 	}
4096 	flush_range->region_entry_added = true;
4097 	PERCPU_GET(pmap_sptm_percpu)->sptm_templates[flush_range->pending_region_entries++] = template;
4098 	if (flush_range->pending_region_entries == SPTM_MAPPING_LIMIT) {
4099 		pmap_multipage_op_submit_region(flush_range);
4100 		return true;
4101 	}
4102 	return false;
4103 }
4104 
4105 /**
4106  * Wrapper function for submitting any pending operations, region-based or disjoint,
4107  * tracked by a flush range object.  This is meant to be used by the top-level caller that
4108  * iterates over the flush range's VA region and calls functions such as
4109  * pmap_page_protect_options_with_flush_range() or arm_force_fast_fault_with_flush_range()
4110  * to construct the relevant SPTM operations arrays.
4111  *
4112  * @param flush_range The object tracking the current state of region and/or disjoint operations.
4113  */
4114 static inline void
4115 pmap_multipage_op_submit(pmap_tlb_flush_range_t *flush_range)
4116 {
4117 	pmap_multipage_op_submit_disjoint(0, flush_range);
4118 	pmap_multipage_op_submit_region(flush_range);
4119 }
4120 
4121 /**
4122  * This is an internal-only flag that indicates the caller of pmap_page_protect_options_with_flush_range()
4123  * is removing/updating all mappings in preparation for a retype operation.  In this case
4124  * pmap_page_protect_options() will assume (and assert) that the PVH lock for the physical page is held
4125  * by the calller, and will perform the necessary retype epoch drain prior to returning.
4126  * by the caller, and will perform the necessary retype epoch drain prior to returning.
4127 #define PMAP_OPTIONS_PPO_PENDING_RETYPE 0x80000000
4128 _Static_assert(PMAP_OPTIONS_PPO_PENDING_RETYPE & PMAP_OPTIONS_RESERVED_MASK,
4129     "PMAP_OPTIONS_PPO_PENDING_RETYPE outside reserved encoding space");
4130 
4131 /**
4132  * Lower the permission for all mappings to a given page. If VM_PROT_NONE is specified,
4133  * the mappings will be removed.
4134  *
4135  * @param ppnum Page number to lower the permission of.
4136  * @param prot The permission to lower to.
4137  * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
4138  *                PMAP_OPTIONS_PPO_PENDING_RETYPE indicates the PVH lock for ppnum is
4139  *                already locked and a retype epoch drain should be performed.
4140  *                PMAP_OPTIONS_COMPRESSOR indicates the function is called by the
4141  *                VM compressor.
4142  * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
4143  *                   by the caller.  This is an input/output parameter which may be updated
4144  *                   to reflect a new PV head value to be passed to a later call to pvh_unlock().
4145  * @param flush_range When present, this function will skip the TLB flush for the
4146  *                    mappings that are covered by the range, leaving that to be
4147  *                    done later by the caller.  It may also avoid submitting mapping
4148  *                    updates directly to the SPTM, instead accumulating them in a
4149  *                    per-CPU array to be submitted later by the caller.
4150  *
4151  * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4152  */
4153 MARK_AS_PMAP_TEXT static void
4154 pmap_page_protect_options_with_flush_range(
4155 	ppnum_t ppnum,
4156 	vm_prot_t prot,
4157 	unsigned int options,
4158 	locked_pvh_t *locked_pvh,
4159 	pmap_tlb_flush_range_t *flush_range)
4160 {
4161 	pmap_paddr_t phys = ptoa(ppnum);
4162 	locked_pvh_t local_locked_pvh = {.pvh = 0};
4163 	pv_entry_t *pve_p = NULL;
4164 	pv_entry_t *pveh_p = NULL;
4165 	pv_entry_t *pvet_p = NULL;
4166 	pt_entry_t *pte_p = NULL;
4167 	pv_entry_t *new_pve_p = NULL;
4168 	pt_entry_t *new_pte_p = NULL;
4169 
4170 	bool remove = false;
4171 	unsigned int pvh_cnt = 0;
4172 	unsigned int num_mappings = 0, num_skipped_mappings = 0;
4173 
4174 	assert(ppnum != vm_page_fictitious_addr);
4175 
4176 	/**
4177 	 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
4178 	 *
4179 	 * PMAP_OPTIONS_NOFLUSH indicates there is no need of flushing the TLB in the entire operation, and
4180 	 * PMAP_OPTIONS_NOFLUSH indicates there is no need to flush the TLB at any point in the operation, and
4181 	 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
4182 	 */
4183 	assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
4184 
4185 	/* Only work with managed pages. */
4186 	if (!pa_valid(phys)) {
4187 		return;
4188 	}
4189 
4190 	/*
4191 	 * Determine the new protection.
4192 	 */
4193 	switch (prot) {
4194 	case VM_PROT_ALL:
4195 		return;         /* nothing to do */
4196 	case VM_PROT_READ:
4197 	case VM_PROT_READ | VM_PROT_EXECUTE:
4198 		break;
4199 	default:
4200 		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4201 		options = options & ~PMAP_OPTIONS_NOFLUSH;
4202 		remove = true;
4203 		break;
4204 	}
4205 
4206 	/**
4207 	 * We don't support cross-page batching (indicated by flush_range being non-NULL) for removals,
4208 	 * as removals must use the SPTM prev_ptes array for accounting, which isn't supported for cross-
4209 	 * page batches.
4210 	 */
4211 	assert((flush_range == NULL) || !remove);
4212 
4213 	unsigned int pai = pa_index(phys);
4214 	if (__probable(locked_pvh == NULL)) {
4215 		if (flush_range != NULL) {
4216 			/**
4217 			 * If we're partway through processing a multi-page batched call,
4218 			 * preemption will already be disabled so we can't simply call
4219 			 * pvh_lock() which may block.  Instead, we first try to acquire
4220 			 * the lock without waiting, which in most cases should succeed.
4221 			 * If it fails, we submit the pending batched operations to re-
4222 			 * enable preemption and then acquire the lock normally.
4223 			 */
4224 			local_locked_pvh = pvh_try_lock(pai);
4225 			if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
4226 				pmap_multipage_op_submit(flush_range);
4227 				local_locked_pvh = pvh_lock(pai);
4228 			}
4229 		} else {
4230 			local_locked_pvh = pvh_lock(pai);
4231 		}
4232 	} else {
4233 		local_locked_pvh = *locked_pvh;
4234 		assert(pai == local_locked_pvh.pai);
4235 	}
4236 	assert(local_locked_pvh.pvh != 0);
4237 	pvh_assert_locked(pai);
4238 
4239 	bool pvh_lock_sleep_mode_needed = false;
4240 
4241 	/*
4242 	 * PVH should be locked before accessing per-CPU data, as we're relying on the lock
4243 	 * to disable preemption.
4244 	 */
4245 	pmap_cpu_data_t *pmap_cpu_data = NULL;
4246 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4247 	sptm_disjoint_op_t *sptm_ops = NULL;
4248 	pt_desc_t **sptm_ptds = NULL;
4249 	ptd_info_t **sptm_ptd_info = NULL;
4250 
4251 	/* BEGIN IGNORE CODESTYLE */
4252 
4253 	/**
4254 	 * This would also work as a block, with the above variables declared using the
4255 	 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
4256 	 * dereferencing __block variables through stack forwarding pointers) isn't needed
4257 	 * here, as we never need to use this code sequence as a closure.
4258 	 */
4259 	#define PPO_PERCPU_INIT() do { \
4260 	        disable_preemption(); \
4261 	        pmap_cpu_data = pmap_get_cpu_data(); \
4262 	        sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
4263 	        sptm_ops = sptm_pcpu->sptm_ops; \
4264 	        sptm_ptds = sptm_pcpu->sptm_ptds; \
4265 	        sptm_ptd_info = sptm_pcpu->sptm_ptd_info; \
4266 	        if (remove) { \
4267 	                os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed); \
4268 			/* \
4269 			 * Ensure the store to inflight_disconnect will be observed before any of the
4270 			 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
4271 			 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4272 			 * another CPU, in between this function's clearing a PTE and dropping the
4273 			 * corresponding pagetable refcount.  That can lead to a panic if the
4274 			 * destroying thread observes a non-zero refcount.  For this we need a store-
4275 			 * store barrier; a store-release operation would not be sufficient.
4276 			 */ \
4277 	                os_atomic_thread_fence(release); \
4278 	        } \
4279 	} while (0)
4280 
4281 	/* END IGNORE CODESTYLE */
4282 
4283 
4284 	PPO_PERCPU_INIT();
4285 
4286 	pv_entry_t **pve_pp = NULL;
4287 
4288 	if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
4289 		pte_p = pvh_ptep(local_locked_pvh.pvh);
4290 	} else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4291 		pve_p = pvh_pve_list(local_locked_pvh.pvh);
4292 		pveh_p = pve_p;
4293 	} else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
4294 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
4295 	}
4296 
4297 	int pve_ptep_idx = 0;
4298 	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4299 
4300 	/*
4301 	 * We need to keep track of whether a particular PVE list contains IOMMU
4302 	 * mappings when removing entries, because we should only remove CPU
4303 	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
4304 	 * it around.
4305 	 */
4306 	bool iommu_mapping_in_pve = false;
4307 
4308 	/**
4309 	 * With regard to TLBI, there are three cases:
4310 	 *
4311 	 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
4312 	 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
4313 	 *    itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
4314 	 *    mapping is out of the range.
4315 	 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
4316 	 *    let SPTM handle TLBI flushing.
4317 	 */
4318 	const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
4319 	const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
4320 
4321 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4322 		if (__improbable(pvh_lock_sleep_mode_needed)) {
4323 			assert((num_mappings == 0) && (num_skipped_mappings == 0));
4324 			if (remove) {
4325 				/**
4326 				 * Clear the in-flight disconnect indicator for the current CPU, as we've
4327 				 * already submitted any prior pending SPTM operations, and we're about to
4328 				 * briefly re-enable preemption which may cause this thread to be migrated.
4329 				 */
4330 				os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
4331 			}
4332 			/**
4333 			 * Undo the explicit preemption disable done in the last call to PPO_PER_CPU_INIT().
4334 			 * Undo the explicit preemption disable done in the last call to PPO_PERCPU_INIT().
4335 			 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
4336 			 * core while processing SPTM per-CPU data.  At the same time, we also want preemption
4337 			 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
4338 			 * urgent ASTs can be handled.
4339 			 */
4340 			enable_preemption();
4341 			pvh_lock_enter_sleep_mode(&local_locked_pvh);
4342 			pvh_lock_sleep_mode_needed = false;
4343 			PPO_PERCPU_INIT();
4344 		}
4345 
4346 		if (pve_p != PV_ENTRY_NULL) {
4347 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4348 			if (pte_p == PT_ENTRY_NULL) {
4349 				goto protect_skip_pve;
4350 			}
4351 		}
4352 
4353 #ifdef PVH_FLAG_IOMMU
4354 		if (pvh_ptep_is_iommu(pte_p)) {
4355 			iommu_mapping_in_pve = true;
4356 			if (__improbable(remove && (options & PMAP_OPTIONS_COMPRESSOR))) {
4357 				const iommu_instance_t iommu = ptep_get_iommu(pte_p);
4358 				panic("%s: attempt to compress ppnum 0x%x owned by iommu driver "
4359 				    "%u (token: %#x), pve_p=%p", __func__, ppnum, GET_IOMMU_ID(iommu),
4360 				    GET_IOMMU_TOKEN(iommu), pve_p);
4361 			}
4362 			if (remove && (pve_p == PV_ENTRY_NULL)) {
4363 				/*
4364 				 * We've found an IOMMU entry and it's the only entry in the PV list.
4365 				 * We don't discard IOMMU entries, so simply set up the new PV list to
4366 				 * contain the single IOMMU PTE and exit the loop.
4367 				 */
4368 				new_pte_p = pte_p;
4369 				break;
4370 			}
4371 			++num_skipped_mappings;
4372 			goto protect_skip_pve;
4373 		}
4374 #endif
4375 
4376 		const pt_entry_t spte = os_atomic_load(pte_p, relaxed);
4377 
4378 		if (__improbable(!remove && !pte_is_valid(spte))) {
4379 			++num_skipped_mappings;
4380 			goto protect_skip_pve;
4381 		}
4382 
4383 		pt_desc_t *ptdp = NULL;
4384 		pmap_t pmap = NULL;
4385 		vm_map_address_t va = 0;
4386 
4387 		if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
4388 			/**
4389 			 * If the current mapping matches the flush range's current iteration position,
4390 			 * there's no need to do the work of getting the PTD.  We already know the pmap,
4391 			 * and the VA is implied by flush_range->pending_region_start.
4392 			 */
4393 			pmap = flush_range->ptfr_pmap;
4394 		} else {
4395 			ptdp = ptep_get_ptd(pte_p);
4396 			pmap = ptdp->pmap;
4397 			va = ptd_get_va(ptdp, pte_p);
4398 		}
4399 
4400 		/**
4401 		 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
4402 		 * pending disjoint ops, so we don't need to do flush range disjoint op management.
4403 		 */
4404 		if ((flush_range != NULL) && (ptdp != NULL)) {
4405 			/**
4406 			 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
4407 			 * We do this in three cases:
4408 			 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
4409 			 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
4410 			 *    for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries != 0).
4411 			 * 3) We need to change the options passed to the SPTM for a run of one or more mappings.  Specifically,
4412 			 *    if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
4413 			 *    belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
4414 			 *    the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
4415 			 */
4416 			uint32_t per_mapping_sptm_update_options = sptm_update_options;
4417 			if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
4418 				per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
4419 			}
4420 			if ((num_mappings == 0) ||
4421 			    (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
4422 				if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
4423 					/**
4424 					 * If we needed to submit the pending disjoint ops to make room for the new page,
4425 					 * flush any pending region ops to reenable preemption and restart the loop with
4426 					 * the lock in sleep mode.  This prevents preemption from being held disabled
4427 					 * for an arbitrary amount of time in the pathological case in which we have
4428 					 * both pending region ops and an excessively long PV list that repeatedly
4429 					 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
4430 					 */
4431 					pmap_multipage_op_submit_region(flush_range);
4432 					assert(num_mappings == 0);
4433 					num_skipped_mappings = 0;
4434 					pvh_lock_sleep_mode_needed = true;
4435 					continue;
4436 				}
4437 			}
4438 		}
4439 
4440 		if (__improbable((pmap == NULL) ||
4441 		    (pte_is_valid(spte) && (atop(pte_to_pa(spte)) != ppnum)))) {
4442 #if MACH_ASSERT
4443 			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4444 				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4445 				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4446 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4447 
4448 				pv_entry_t *check_pvep = pve_p;
4449 
4450 				do {
4451 					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4452 						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4453 						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, (void*)local_locked_pvh.pvh, pve_p, pai);
4454 					}
4455 				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4456 
4457 				/* Restore previous PTEP value. */
4458 				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4459 			}
4460 #endif
4461 			panic("%s: bad PVE pte_p=%p pmap=%p prot=%d options=%u, pvh=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4462 			    __func__, pte_p, pmap, prot, options, (void*)local_locked_pvh.pvh, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4463 		}
4464 
4465 		pt_entry_t pte_template = ARM_PTE_EMPTY;
4466 
4467 		if (ptdp != NULL) {
4468 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
4469 			sptm_ops[num_mappings].vaddr = va;
4470 		}
4471 
4472 		/* Remove the mapping if new protection is NONE */
4473 		if (remove) {
4474 			sptm_ptds[num_mappings] = ptdp;
4475 			sptm_ptd_info[num_mappings] = ptd_get_info(ptdp);
4476 			sptm_pcpu->sptm_acct_flags[num_mappings] = 0;
4477 			if (pmap != kernel_pmap) {
4478 				const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4479 				const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4480 
4481 				if (is_internal) {
4482 					sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_INTERNAL;
4483 					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4484 				}
4485 				if (is_altacct) {
4486 					sptm_pcpu->sptm_acct_flags[num_mappings] |= PMAP_SPTM_FLAG_ALTACCT;
4487 					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4488 				}
4489 				if (compress && is_internal) {
4490 					pte_template = ARM_PTE_COMPRESSED;
4491 					if (is_altacct) {
4492 						pte_template |= ARM_PTE_COMPRESSED_ALT;
4493 					}
4494 				}
4495 			}
4496 			/* Remove this CPU mapping from PVE list. */
4497 			if (pve_p != PV_ENTRY_NULL) {
4498 				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4499 			}
4500 		} else {
4501 			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4502 
4503 			if (pmap == kernel_pmap) {
4504 				pte_template = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4505 			} else {
4506 				pte_template = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4507 			}
4508 
4509 			/*
4510 			 * At a minimum, we must clear the 'was writeable' flag: we're revoking at least write access,
4511 			 * meaning that the VM is effectively requesting that subsequent write accesses to these mappings
4512 			 * go through vm_fault() instead of being handled by arm_fast_fault().
4513 			 */
4514 			pte_set_was_writeable(pte_template, false);
4515 
4516 			/*
4517 			 * While the naive implementation of this would serve to add execute
4518 			 * permission, this is not how the VM uses this interface, nor how
4519 			 * x86_64 implements it.  So ignore requests to add execute permissions.
4520 			 */
4521 #if DEVELOPMENT || DEBUG
4522 			if ((!(prot & VM_PROT_EXECUTE) && nx_enabled && pmap->nx_enabled) ||
4523 			    (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4524 #else
4525 			if (!(prot & VM_PROT_EXECUTE) ||
4526 			    (pte_to_xprr_perm(spte) == XPRR_USER_TPRO_PERM))
4527 #endif
4528 			{
4529 				pte_template |= pt_attr_leaf_xn(pt_attr);
4530 			}
4531 		}
4532 
4533 		if (ptdp != NULL) {
4534 			sptm_ops[num_mappings].pte_template = pte_template;
4535 			++num_mappings;
4536 		} else if (pmap_insert_flush_range_template(pte_template, flush_range)) {
4537 			/**
4538 			 * We submit both the pending disjoint and pending region ops whenever
4539 			 * either category reaches the mapping limit.  Having pending operations
4540 			 * in either category will keep preemption disabled, and we want to ensure
4541 			 * that we can at least temporarily re-enable preemption roughly every
4542 			 * SPTM_MAPPING_LIMIT mappings.
4543 			 */
4544 			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4545 			pvh_lock_sleep_mode_needed = true;
4546 			num_mappings = num_skipped_mappings = 0;
4547 		}
4548 
4549 protect_skip_pve:
4550 		if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
4551 			if (flush_range != NULL) {
4552 				/* See comment above for why we submit both disjoint and region ops when we hit the limit. */
4553 				pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
4554 				pmap_multipage_op_submit_region(flush_range);
4555 			} else if (num_mappings > 0) {
4556 				if (remove) {
4557 					pmap_disjoint_unmap(phys, num_mappings);
4558 				} else {
4559 					sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4560 				}
4561 			}
4562 			pvh_lock_sleep_mode_needed = true;
4563 			num_mappings = num_skipped_mappings = 0;
4564 		}
4565 		pte_p = PT_ENTRY_NULL;
4566 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
4567 			pve_ptep_idx = 0;
4568 
4569 			if (remove) {
4570 				/**
4571 				 * If there are any IOMMU mappings in the PVE list, preserve
4572 				 * those mappings in a new PVE list (new_pve_p) which will later
4573 				 * become the new PVH entry. Keep track of the CPU mappings in
4574 				 * pveh_p/pvet_p so they can be deallocated later.
4575 				 */
4576 				if (iommu_mapping_in_pve) {
4577 					iommu_mapping_in_pve = false;
4578 					pv_entry_t *temp_pve_p = pve_next(pve_p);
4579 					pve_remove(&local_locked_pvh, pve_pp, pve_p);
4580 					if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
4581 						pveh_p = pvh_pve_list(local_locked_pvh.pvh);
4582 					} else {
4583 						assert(pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL));
4584 						pveh_p = PV_ENTRY_NULL;
4585 					}
4586 					pve_p->pve_next = new_pve_p;
4587 					new_pve_p = pve_p;
4588 					pve_p = temp_pve_p;
4589 					continue;
4590 				} else {
4591 					pvet_p = pve_p;
4592 					pvh_cnt++;
4593 				}
4594 			}
4595 
4596 			pve_pp = pve_next_ptr(pve_p);
4597 			pve_p = pve_next(pve_p);
4598 			iommu_mapping_in_pve = false;
4599 		}
4600 	}
4601 
4602 	if (num_mappings != 0) {
4603 		if (remove) {
4604 			pmap_disjoint_unmap(phys, num_mappings);
4605 		} else if (flush_range == NULL) {
4606 			sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
4607 		} else {
4608 			/* Resync the pending mapping state in flush_range with our local state. */
4609 			assert(num_mappings >= flush_range->pending_disjoint_entries);
4610 			flush_range->pending_disjoint_entries = num_mappings;
4611 		}
4612 	}
4613 
4614 	if (remove) {
4615 		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
4616 	}
4617 
4618 	/**
4619 	 * Undo the explicit disable_preemption() done in PPO_PERCPU_INIT().
4620 	 * Note that enable_preemption() decrements a per-thread counter, so if
4621 	 * we happen to still hold the PVH lock in spin mode then preemption won't
4622 	 * actually be re-enabled until we drop the lock (which also decrements
4623 	 * the per-thread counter).
4624 	 */
4625 	enable_preemption();
4626 
4627 	/* if we removed a bunch of entries, take care of them now */
4628 	if (remove) {
4629 		/**
4630 		 * If we (or our caller as indicated by PMAP_OPTIONS_PPO_PENDING_RETYPE) will
4631 		 * be retyping the page, we need to drain the epochs to ensure that concurrent
4632 		 * calls to batched operations such as pmap_remove() and the various multipage
4633 		 * attribute update functions have finished consuming mappings of this page.
4634 		 */
4635 		const bool needs_retyping = pmap_prepare_unmapped_page_for_retype(phys);
4636 		if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !needs_retyping) {
4637 			/**
4638 			 * pmap_prepare_unmapped_page_for_retype() will only return true if
4639 			 * the page belongs to a certain set of types that need to be auto-
4640 			 * retyped back to XNU_DEFAULT when they are unmapped.  But if the
4641 			 * caller indicated that it's going to retype the page, we need
4642 			 * to drain the epochs regardless of the current page type.
4643 			 */
4644 			pmap_retype_epoch_prepare_drain();
4645 		}
4646 		if (new_pve_p != PV_ENTRY_NULL) {
4647 			pvh_update_head(&local_locked_pvh, new_pve_p, PVH_TYPE_PVEP);
4648 		} else if (new_pte_p != PT_ENTRY_NULL) {
4649 			pvh_update_head(&local_locked_pvh, new_pte_p, PVH_TYPE_PTEP);
4650 		} else {
4651 			pvh_set_flags(&local_locked_pvh, 0);
4652 			pvh_update_head(&local_locked_pvh, PV_ENTRY_NULL, PVH_TYPE_NULL);
4653 		}
4654 
4655 		/* If removing the last mapping to a specially-protected page, retype the page back to XNU_DEFAULT. */
4656 		const bool retype_needed = pmap_retype_unmapped_page(phys);
4657 		if ((options & PMAP_OPTIONS_PPO_PENDING_RETYPE) && !retype_needed) {
4658 			pmap_retype_epoch_drain();
4659 		}
4660 	}
4661 
4662 	if (__probable(locked_pvh == NULL)) {
4663 		pvh_unlock(&local_locked_pvh);
4664 	} else {
4665 		*locked_pvh = local_locked_pvh;
4666 	}
4667 
4668 	if (remove && (pvet_p != PV_ENTRY_NULL)) {
4669 		assert(pveh_p != PV_ENTRY_NULL);
4670 		pv_list_free(pveh_p, pvet_p, pvh_cnt);
4671 	}
4672 
4673 	if ((flush_range != NULL) && !preemption_enabled()) {
4674 		flush_range->processed_entries += num_skipped_mappings;
4675 	}
4676 }
4677 
4678 MARK_AS_PMAP_TEXT void
4679 pmap_page_protect_options_internal(
4680 	ppnum_t ppnum,
4681 	vm_prot_t prot,
4682 	unsigned int options,
4683 	void *arg)
4684 {
4685 	if (arg != NULL) {
4686 		/*
4687 		 * This is a legacy argument from the pre-ARM era that the VM layer passes in to hint that it will call
4688 		 * pmap_flush() later to flush the TLB. On ARM platforms, however, pmap_flush() is not implemented,
4689 		 * as it's typically more efficient to perform the TLB flushing inline with the page table updates
4690 		 * themselves. Therefore, if the argument is non-NULL, pmap will take care of TLB flushing itself
4691 		 * by clearing PMAP_OPTIONS_NOFLUSH.
4692 		 */
4693 		options &= ~PMAP_OPTIONS_NOFLUSH;
4694 	}
4695 	pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL, NULL);
4696 }
4697 
4698 void
4699 pmap_page_protect_options(
4700 	ppnum_t ppnum,
4701 	vm_prot_t prot,
4702 	unsigned int options,
4703 	void *arg)
4704 {
4705 	pmap_paddr_t    phys = ptoa(ppnum);
4706 
4707 	assert(ppnum != vm_page_fictitious_addr);
4708 
4709 	/* Only work with managed pages. */
4710 	if (!pa_valid(phys)) {
4711 		return;
4712 	}
4713 
4714 	/*
4715 	 * Determine the new protection.
4716 	 */
4717 	if (prot == VM_PROT_ALL) {
4718 		return;         /* nothing to do */
4719 	}
4720 
4721 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
4722 
4723 	pmap_page_protect_options_internal(ppnum, prot, options, arg);
4724 
4725 	PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
4726 }
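/*
 * Illustrative usage sketch (hypothetical call sites, not taken from actual VM code):
 * the VM typically uses this interface either to write-protect every mapping of a
 * physical page or to sever those mappings entirely, e.g.
 *
 *	pmap_page_protect_options(pn, VM_PROT_READ, 0, NULL);
 *	pmap_page_protect_options(pn, VM_PROT_NONE, PMAP_OPTIONS_COMPRESSOR, NULL);
 *
 * The VM_PROT_NONE form takes the removal path in
 * pmap_page_protect_options_with_flush_range() above, which also handles
 * compressed-page accounting and any SPTM retyping required once the page is
 * fully unmapped.
 */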
4727 
4728 
4729 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
4730 MARK_AS_PMAP_TEXT void
4731 pmap_disable_user_jop_internal(pmap_t pmap)
4732 {
4733 	if (pmap == kernel_pmap) {
4734 		panic("%s: called with kernel_pmap", __func__);
4735 	}
4736 	validate_pmap_mutable(pmap);
4737 	sptm_configure_root(pmap->ttep, 0, SPTM_ROOT_PT_FLAG_JOP);
4738 	pmap->disable_jop = true;
4739 }
4740 
4741 void
4742 pmap_disable_user_jop(pmap_t pmap)
4743 {
4744 	pmap_disable_user_jop_internal(pmap);
4745 }
4746 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
4747 
4748 /*
4749  * Indicates if the pmap layer enforces some additional restrictions on the
4750  * given set of protections.
4751  */
4752 bool
4753 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
4754 {
4755 	return false;
4756 }
4757 
4758 /*
4759  *	Set the physical protection on the
4760  *	specified range of this map as requested.
4761  *	VERY IMPORTANT: Will not increase permissions.
4762  *	VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
4763  */
4764 void
4765 pmap_protect(
4766 	pmap_t pmap,
4767 	vm_map_address_t b,
4768 	vm_map_address_t e,
4769 	vm_prot_t prot)
4770 {
4771 	pmap_protect_options(pmap, b, e, prot, 0, NULL);
4772 }
4773 
4774 static bool
4775 pmap_protect_strong_sync(unsigned int num_mappings __unused)
4776 {
4777 	return false;
4778 }
4779 
4780 MARK_AS_PMAP_TEXT vm_map_address_t
4781 pmap_protect_options_internal(
4782 	pmap_t pmap,
4783 	vm_map_address_t start,
4784 	vm_map_address_t end,
4785 	vm_prot_t prot,
4786 	unsigned int options,
4787 	__unused void *args)
4788 {
4789 	pt_entry_t       *pte_p;
4790 	bool             set_NX = true;
4791 	bool             set_XO = false;
4792 	bool             should_have_removed = false;
4793 	bool             need_strong_sync = false;
4794 
4795 	/* Validate the pmap input before accessing its data. */
4796 	validate_pmap_mutable(pmap);
4797 
4798 	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4799 
4800 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
4801 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4802 	}
4803 
4804 #if DEVELOPMENT || DEBUG
4805 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
4806 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
4807 			should_have_removed = true;
4808 		}
4809 	} else
4810 #endif
4811 	{
4812 		/* Determine the new protection. */
4813 		switch (prot) {
4814 		case VM_PROT_EXECUTE:
4815 			set_XO = true;
4816 			OS_FALLTHROUGH;
4817 		case VM_PROT_READ:
4818 		case VM_PROT_READ | VM_PROT_EXECUTE:
4819 			break;
4820 		case VM_PROT_READ | VM_PROT_WRITE:
4821 		case VM_PROT_ALL:
4822 			return end;         /* nothing to do */
4823 		default:
4824 			should_have_removed = true;
4825 		}
4826 	}
4827 
4828 	if (__improbable(should_have_removed)) {
4829 		panic("%s: should have been a remove operation, "
4830 		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
4831 		    __FUNCTION__,
4832 		    pmap, (void *)start, (void *)end, prot, options, args);
4833 	}
4834 
4835 #if DEVELOPMENT || DEBUG
4836 	bool force_write = false;
4837 	if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
4838 		force_write = true;
4839 	}
4840 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4841 #else
4842 	if ((prot & VM_PROT_EXECUTE))
4843 #endif
4844 	{
4845 		set_NX = false;
4846 	} else {
4847 		set_NX = true;
4848 	}
4849 
4850 	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4851 	vm_map_address_t va = start;
4852 	vm_map_address_t sptm_start_va = start;
4853 	unsigned int num_mappings = 0;
4854 
4855 	pmap_lock(pmap, PMAP_LOCK_SHARED);
4856 
4857 	pte_p = pmap_pte(pmap, start);
4858 
4859 	if (pte_p == NULL) {
4860 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
4861 		return end;
4862 	}
4863 
4864 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
4865 #if DEVELOPMENT || DEBUG
4866 	if (!force_write)
4867 #endif
4868 	{
4869 		disable_preemption();
4870 		sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
4871 	}
4872 
4873 	pt_entry_t tmplate = ARM_PTE_EMPTY;
4874 
4875 	if (pmap == kernel_pmap) {
4876 #if DEVELOPMENT || DEBUG
4877 		if (force_write) {
4878 			tmplate = ARM_PTE_AP(AP_RWNA);
4879 		} else
4880 #endif
4881 		{
4882 			tmplate = ARM_PTE_AP(AP_RONA);
4883 		}
4884 	} else {
4885 #if DEVELOPMENT || DEBUG
4886 		if (force_write) {
4887 			assert(pmap->type != PMAP_TYPE_NESTED);
4888 			tmplate = pt_attr_leaf_rw(pt_attr);
4889 		} else
4890 #endif
4891 		if (set_XO) {
4892 			tmplate = pt_attr_leaf_rona(pt_attr);
4893 		} else {
4894 			tmplate = pt_attr_leaf_ro(pt_attr);
4895 		}
4896 	}
4897 
4898 	if (set_NX) {
4899 		tmplate |= pt_attr_leaf_xn(pt_attr);
4900 	}
4901 
4902 	while (va < end) {
4903 		pt_entry_t spte = ARM_PTE_EMPTY;
4904 
4905 		/**
4906 		 * Removing "NX" would grant "execute" access immediately, bypassing any
4907 		 * checks the VM might want to do in its soft fault path.
4908 		 * pmap_protect() and co. are not allowed to increase access permissions,
4909 		 * except in the PMAP_OPTIONS_PROTECT_IMMEDIATE internal-only case.
4910 		 * Therefore, if we are not explicitly clearing execute permissions, inherit
4911 		 * the existing permissions.
4912 		 */
4913 		if (!set_NX) {
4914 			spte = os_atomic_load(pte_p, relaxed);
4915 			if (__improbable(!pte_is_valid(spte))) {
4916 				tmplate |= pt_attr_leaf_xn(pt_attr);
4917 			} else {
4918 				tmplate |= (spte & ARM_PTE_XMASK);
4919 			}
4920 		}
4921 
4922 #if DEVELOPMENT || DEBUG
4923 		/*
4924 		 * PMAP_OPTIONS_PROTECT_IMMEDIATE is an internal-only option that's intended to
4925 		 * provide a "backdoor" to allow normally write-protected compressor pages to be
4926 		 * temporarily written without triggering expensive write faults.
4927 		 */
4928 		while (force_write) {
4929 			if (spte == ARM_PTE_EMPTY) {
4930 				spte = os_atomic_load(pte_p, relaxed);
4931 			}
4932 			const pt_entry_t prev_pte = spte;
4933 
4934 			/* A concurrent disconnect may have cleared the PTE. */
4935 			if (__improbable(!pte_is_valid(spte))) {
4936 				break;
4937 			}
4938 
4939 			/* Inherit permissions and "was_writeable" from the template. */
4940 			spte = (spte & ~(ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE)) |
4941 			    (tmplate & (ARM_PTE_APMASK | ARM_PTE_XMASK | ARM_PTE_WRITEABLE));
4942 
4943 			/* Access flag should be set for any immediate change in protections */
4944 			spte |= ARM_PTE_AF;
4945 			const pmap_paddr_t pa = pte_to_pa(spte);
4946 			const unsigned int pai = pa_index(pa);
4947 			locked_pvh_t locked_pvh;
4948 			if (pa_valid(pa)) {
4949 				locked_pvh = pvh_lock(pai);
4950 
4951 				/**
4952 				 * The VM may concurrently call pmap_disconnect() on the compressor
4953 				 * page in question, e.g. if relocating the page to satisfy a precious
4954 				 * allocation.  Now that we hold the PVH lock, re-check the PTE and
4955 				 * restart the loop if it's different from the value we read before
4956 				 * we held the lock.
4957 				 */
4958 				if (__improbable(os_atomic_load(pte_p, relaxed) != prev_pte)) {
4959 					pvh_unlock(&locked_pvh);
4960 					spte = ARM_PTE_EMPTY;
4961 					continue;
4962 				}
4963 				ppattr_modify_bits(pai, PP_ATTR_REFFAULT | PP_ATTR_MODFAULT,
4964 				    PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
4965 			}
4966 
4967 			__assert_only const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, va, spte);
4968 
4969 			/**
4970 			 * We don't expect the VM to be concurrently calling pmap_remove() against these
4971 			 * compressor mappings.  If it does for some reason, that could cause the above
4972 			 * call to return either SPTM_SUCCESS or SPTM_MAP_FLUSH_PENDING.
4973 			 */
4974 			assert3u(sptm_status, ==, SPTM_MAP_VALID);
4975 
4976 			if (pa_valid(pa)) {
4977 				pvh_unlock(&locked_pvh);
4978 			}
4979 			break;
4980 		}
4981 
4982 #endif /* DEVELOPMENT || DEBUG */
4983 
4984 		va += pmap_page_size;
4985 		++pte_p;
4986 
4987 #if DEVELOPMENT || DEBUG
4988 		if (!force_write)
4989 #endif
4990 		{
4991 			sptm_pcpu->sptm_templates[num_mappings] = tmplate;
4992 			++num_mappings;
4993 			if (num_mappings == SPTM_MAPPING_LIMIT) {
4994 				/**
4995 				 * Enter the retype epoch for the batched update operation.  This is necessary because we
4996 				 * cannot reasonably hold the PVH locks for all pages mapped by the region during this
4997 				 * call, so a concurrent pmap_page_protect() operation against one of those pages may
4998 				 * race this call.  That should be perfectly fine as far as the PTE updates are concerned,
4999 				 * but if pmap_page_protect() then needs to retype the page, an SPTM violation may result
5000 				 * if it does not first drain our epoch.
5001 				 */
5002 				pmap_retype_epoch_enter();
5003 				sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5004 				    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5005 				pmap_retype_epoch_exit();
5006 				need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5007 
5008 				/* Temporarily re-enable preemption to allow any urgent ASTs to be processed. */
5009 				enable_preemption();
5010 				num_mappings = 0;
5011 				sptm_start_va = va;
5012 				disable_preemption();
5013 				sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
5014 			}
5015 		}
5016 	}
5017 
5018 	/* This won't happen in the force_write case as we should never increment num_mappings. */
5019 	if (num_mappings != 0) {
5020 		pmap_retype_epoch_enter();
5021 		sptm_update_region(pmap->ttep, sptm_start_va, num_mappings, sptm_pcpu->sptm_templates_pa,
5022 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE);
5023 		pmap_retype_epoch_exit();
5024 		need_strong_sync = need_strong_sync || pmap_protect_strong_sync(num_mappings);
5025 	}
5026 
5027 #if DEVELOPMENT || DEBUG
5028 	if (!force_write)
5029 #endif
5030 	{
5031 		enable_preemption();
5032 	}
5033 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
5034 	if (__improbable(need_strong_sync)) {
5035 		arm64_sync_tlb(true);
5036 	}
5037 	return va;
5038 }
5039 
5040 void
5041 pmap_protect_options(
5042 	pmap_t pmap,
5043 	vm_map_address_t b,
5044 	vm_map_address_t e,
5045 	vm_prot_t prot,
5046 	unsigned int options,
5047 	__unused void *args)
5048 {
5049 	vm_map_address_t l, beg;
5050 
5051 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5052 
5053 	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5054 		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5055 		    pmap, (uint64_t)b, (uint64_t)e);
5056 	}
5057 
5058 	/*
5059 	 * We allow single-page requests to execute non-preemptibly,
5060 	 * as it doesn't make sense to sample AST_URGENT for a single-page
5061 	 * operation, and there are a couple of special use cases that
5062 	 * require a non-preemptible single-page operation.
5063 	 */
5064 	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5065 		pmap_verify_preemptible();
5066 	}
5067 
5068 #if DEVELOPMENT || DEBUG
5069 	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5070 		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5071 			pmap_remove_options(pmap, b, e, options);
5072 			return;
5073 		}
5074 	} else
5075 #endif
5076 	{
5077 		/* Determine the new protection. */
5078 		switch (prot) {
5079 		case VM_PROT_EXECUTE:
5080 		case VM_PROT_READ:
5081 		case VM_PROT_READ | VM_PROT_EXECUTE:
5082 			break;
5083 		case VM_PROT_READ | VM_PROT_WRITE:
5084 		case VM_PROT_ALL:
5085 			return;         /* nothing to do */
5086 		default:
5087 			pmap_remove_options(pmap, b, e, options);
5088 			return;
5089 		}
5090 	}
5091 
5092 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5093 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5094 	    VM_KERNEL_ADDRHIDE(e));
5095 
5096 	beg = b;
5097 
5098 	while (beg < e) {
5099 		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5100 
5101 		if (l > e) {
5102 			l = e;
5103 		}
5104 
5105 		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5106 	}
5107 
5108 
5109 	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5110 }
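/*
 * For orientation: the loop above carves a large request into chunks that never cross
 * a twig boundary, so each pmap_protect_options_internal() call operates within the VA
 * range covered by a single leaf page table, and the thread can be preempted between
 * chunks.  This is why multi-page requests must be issued from a preemptible context
 * (see the pmap_verify_preemptible() call above).
 */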
5111 
5112 /**
5113  * Inserts an arbitrary number of physical pages (a "block") into a pmap.
5114  *
5115  * @param pmap pmap to insert the pages into.
5116  * @param va virtual address to map the pages into.
5117  * @param pa page number of the first physical page to map.
5118  * @param size block size, in number of pages.
5119  * @param prot mapping protection attributes.
5120  * @param attr flags to pass to pmap_enter().
5121  *
5122  * @return KERN_SUCCESS.
5123  */
5124 kern_return_t
5125 pmap_map_block(
5126 	pmap_t pmap,
5127 	addr64_t va,
5128 	ppnum_t pa,
5129 	uint32_t size,
5130 	vm_prot_t prot,
5131 	int attr,
5132 	unsigned int flags)
5133 {
5134 	return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5135 }
5136 
5137 /**
5138  * Inserts an arbitrary number of physical pages (a "block") into a pmap.
5139  * As opposed to pmap_map_block(), this function takes
5140  * a physical address as an input and operates using the
5141  * page size associated with the input pmap.
5142  *
5143  * @param pmap pmap to insert the pages into.
5144  * @param va virtual address to map the pages into.
5145  * @param pa physical address of the first physical page to map.
5146  * @param size block size, in number of pages.
5147  * @param prot mapping protection attributes.
5148  * @param attr flags to pass to pmap_enter().
5149  *
5150  * @return KERN_SUCCESS.
5151  */
5152 kern_return_t
5153 pmap_map_block_addr(
5154 	pmap_t pmap,
5155 	addr64_t va,
5156 	pmap_paddr_t pa,
5157 	uint32_t size,
5158 	vm_prot_t prot,
5159 	int attr,
5160 	unsigned int flags)
5161 {
5162 #if __ARM_MIXED_PAGE_SIZE__
5163 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5164 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5165 #else
5166 	const uint64_t pmap_page_size = PAGE_SIZE;
5167 #endif
5168 
5169 	for (ppnum_t page = 0; page < size; page++) {
5170 		if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE, PMAP_MAPPING_TYPE_INFER) != KERN_SUCCESS) {
5171 			panic("%s: failed pmap_enter_addr, "
5172 			    "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5173 			    __FUNCTION__,
5174 			    pmap, va, (uint64_t)pa, size, prot, flags);
5175 		}
5176 
5177 		va += pmap_page_size;
5178 		pa += pmap_page_size;
5179 	}
5180 
5181 
5182 	return KERN_SUCCESS;
5183 }
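/*
 * Example usage (hypothetical addresses/values, shown only to illustrate the calling
 * convention): map a physically contiguous 4-page block of device registers into the
 * kernel pmap as uncached I/O memory:
 *
 *	pmap_map_block_addr(kernel_pmap, va, (pmap_paddr_t)0x80000000ULL, 4,
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_WIMG_IO, 0);
 *
 * The loop above simply calls pmap_enter_addr() once per page, advancing both va and
 * pa by the pmap's page size, and panics rather than returning an error on failure.
 */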
5184 
5185 kern_return_t
5186 pmap_enter_addr(
5187 	pmap_t pmap,
5188 	vm_map_address_t v,
5189 	pmap_paddr_t pa,
5190 	vm_prot_t prot,
5191 	vm_prot_t fault_type,
5192 	unsigned int flags,
5193 	boolean_t wired,
5194 	pmap_mapping_type_t mapping_type)
5195 {
5196 	return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, mapping_type);
5197 }
5198 
5199 /*
5200  *	Insert the given physical page (p) at
5201  *	the specified virtual address (v) in the
5202  *	target physical map with the protection requested.
5203  *
5204  *	If specified, the page will be wired down, meaning
5205  *	that the related pte cannot be reclaimed.
5206  *
5207  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5208  *	or lose information.  That is, this routine must actually
5209  *	insert this page into the given map eventually (must make
5210  *	forward progress eventually).
5211  */
5212 kern_return_t
5213 pmap_enter(
5214 	pmap_t pmap,
5215 	vm_map_address_t v,
5216 	ppnum_t pn,
5217 	vm_prot_t prot,
5218 	vm_prot_t fault_type,
5219 	unsigned int flags,
5220 	boolean_t wired,
5221 	pmap_mapping_type_t mapping_type)
5222 {
5223 	return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, mapping_type);
5224 }
5225 
5226 /*
5227  * Attempt to update a PTE constructed by pmap_enter_options().
5228  *
5229  * @note performs no page table or accounting modifications, nor any lasting SPTM page type modification, on failure.
5230  * @note expects to be called with preemption disabled to guarantee safe access to SPTM per-CPU data.
5231  *
5232  * @param pmap The pmap representing the address space in which to store the new PTE
5233  * @param pte_p The physical aperture KVA of the PTE to store
5234  * @param new_pte The new value to store in *pte_p
5235  * @param v The virtual address mapped by pte_p
5236  * @param locked_pvh Input/Output parameter pointing to a wrapped pv_head_table entry returned by
5237  *        a previous call to pvh_lock().  *locked_pvh will be updated if existing mappings
5238  *        need to be disconnected prior to retyping.
5239  * @param old_pte Returns the prior PTE contents, iff the PTE is successfully updated
5240  * @param options bitmask of PMAP_OPTIONS_* flags passed to pmap_enter_options().
5241  * @param mapping_type The type of the new mapping, this defines which SPTM frame type to use.
5242  *
5243  * @return SPTM_SUCCESS iff able to successfully update *pte_p to new_pte via sptm_map_page(),
5244  *         SPTM_MAP_VALID if an existing mapping was successfully upgraded via sptm_map_page(),
5245  *         SPTM_MAP_FLUSH_PENDING if the TLB flush of a previous mapping is still in-flight and
5246  *             the mapping operation should be retried, or if the mapping operation should be retried
5247  *             because we had to temporarily re-enable preemption which would invalidate caller-held
5248  *             per-CPU data.
5249  *         Otherwise an appropriate SPTM or TXM error code; in these cases the mapping should not be
5250  *             retried and the caller should return an error.
5251  */
5252 static inline sptm_return_t
5253 pmap_enter_pte(
5254 	pmap_t pmap,
5255 	pt_entry_t *pte_p,
5256 	pt_entry_t new_pte,
5257 	locked_pvh_t *locked_pvh,
5258 	pt_entry_t *old_pte,
5259 	vm_map_address_t v,
5260 	unsigned int options,
5261 	pmap_mapping_type_t mapping_type)
5262 {
5263 	sptm_pte_t prev_pte;
5264 	bool changed_wiring = false;
5265 
5266 	assert(pte_p != NULL);
5267 	assert(old_pte != NULL);
5268 
5269 	/* SPTM TODO: handle PAGE_RATIO_4 configurations if those devices remain supported. */
5270 
5271 	assert(get_preemption_level() > 0);
5272 	const pmap_paddr_t pa = pte_to_pa(new_pte) & ~PAGE_MASK;
5273 	sptm_frame_type_t prev_frame_type = XNU_DEFAULT;
5274 	sptm_frame_type_t new_frame_type = XNU_DEFAULT;
5275 
5276 	/*
5277 	 * If the caller specified a mapping type of PMAP_MAPPING_TYPE_INFER, then we
5278 	 * keep the existing logic of deriving the SPTM frame type from the XPRR permissions.
5279 	 *
5280 	 * If the caller specified another mapping type, we simply follow that. This refactor was
5281 	 * needed for the XNU_KERNEL_RESTRICTED work, and it also allows us to be more precise about
5282 	 * what we want: it's better to let the caller specify the mapping type than to infer it from
5283 	 * the permissions.
5284 	 *
5285 	 * In the future, we should move entirely to use pmap_mapping_type_t; see rdar://114886323.
5286 	 */
5287 	if (mapping_type != PMAP_MAPPING_TYPE_INFER) {
5288 		switch (mapping_type) {
5289 		case PMAP_MAPPING_TYPE_DEFAULT:
5290 			new_frame_type = (sptm_frame_type_t)mapping_type;
5291 			break;
5292 		case PMAP_MAPPING_TYPE_ROZONE:
5293 			assert(((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pmap_get_pt_attr(pmap)))));
5294 			new_frame_type = (sptm_frame_type_t)mapping_type;
5295 			break;
5296 		case PMAP_MAPPING_TYPE_RESTRICTED:
5297 			if (use_xnu_restricted) {
5298 				new_frame_type = (sptm_frame_type_t)mapping_type;
5299 			} else {
5300 				new_frame_type = XNU_DEFAULT;
5301 			}
5302 			break;
5303 		default:
5304 			panic("invalid mapping type: %d", mapping_type);
5305 		}
5306 	} else if (__improbable(pte_to_xprr_perm(new_pte) == XPRR_USER_JIT_PERM)) {
5307 		/*
5308 		 * Always check for XPRR_USER_JIT_PERM before we check for anything else. When using
5309 		 * RWX permissions, the only allowed type is XNU_USER_JIT, regardless of any other
5310 		 * flags which the VM may have provided.
5311 		 *
5312 		 * TODO: Assert that the PMAP_OPTIONS_XNU_USER_DEBUG flag isn't set when entering
5313 		 * this case. We can't do this for now because this might trigger on some macOS
5314 		 * systems where applications use MAP_JIT with RW/RX permissions, and then later
5315 		 * switch to RWX (which will cause a switch to XNU_USER_JIT from XNU_USER_DEBUG
5316 		 * but the VM will still have PMAP_OPTIONS_XNU_USER_DEBUG set). If the VM can
5317 		 * catch this case, and remove PMAP_OPTIONS_XNU_USER_DEBUG when an application
5318 		 * switches to RWX, then we can start asserting this requirement.
5319 		 */
5320 		new_frame_type = XNU_USER_JIT;
5321 	} else if (__improbable(options & PMAP_OPTIONS_XNU_USER_DEBUG)) {
5322 		/*
5323 		 * Both XNU_USER_DEBUG and XNU_USER_EXEC allow RX permissions. Given that, we must
5324 		 * test for PMAP_OPTIONS_XNU_USER_DEBUG before we test for XNU_USER_EXEC since the
5325 		 * XNU_USER_DEBUG type overlays the XNU_USER_EXEC type.
5326 		 */
5327 		new_frame_type = XNU_USER_DEBUG;
5328 	} else if (pte_to_xprr_perm(new_pte) == XPRR_USER_RX_PERM) {
5329 		new_frame_type = XNU_USER_EXEC;
5330 	}
5331 
5332 	if (__improbable(new_frame_type != XNU_DEFAULT)) {
5333 		prev_frame_type = sptm_get_frame_type(pa);
5334 	}
5335 
5336 	if (__improbable(new_frame_type != prev_frame_type)) {
5337 		/**
5338 		 * Remove all existing mappings prior to retyping, so that we can safely retype without having to worry
5339 		 * about a concurrent operation on one of those mappings triggering an SPTM violation.  In particular,
5340 		 * pmap_remove() may clear a mapping to this page without holding its PVH lock.  This approach works
5341 		 * because we hold the PVH lock during this call, and any attempt to enter a new mapping for the page
5342 		 * will also need to grab the PVH lock and call this function.
5343 		 */
5344 		pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
5345 		    PMAP_OPTIONS_PPO_PENDING_RETYPE, locked_pvh, NULL);
5346 		/**
5347 		 * In the unlikely event that pmap_page_protect_options_with_flush_range() had to process
5348 		 * an excessively long PV list, it will have enabled preemption by placing the PVH lock
5349 		 * in sleep mode.  In this case, we may have been migrated to a different CPU, and caller
5350 		 * assumptions about the state of per-CPU data (such as per-CPU PVE availability) will no
5351 		 * longer hold true.  Ask the caller to retry by pretending we encountered a pending flush.
5352 		 */
5353 		if (__improbable(preemption_enabled())) {
5354 			return SPTM_MAP_FLUSH_PENDING;
5355 		}
5356 		sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5357 		/* Reload the existing frame type, as pmap_page_protect_options() may have changed it back to XNU_DEFAULT. */
5358 		prev_frame_type = sptm_get_frame_type(pa);
5359 		sptm_retype(pa, prev_frame_type, new_frame_type, retype_params);
5360 	}
5361 
5362 	const sptm_return_t sptm_status = sptm_map_page(pmap->ttep, v, new_pte);
5363 	if (__improbable((sptm_status != SPTM_SUCCESS) && (sptm_status != SPTM_MAP_VALID))) {
5364 		/*
5365 		 * We should always undo our previous retype, even if the SPTM returned SPTM_MAP_FLUSH_PENDING as
5366 		 * opposed to a TXM error.  In the case of SPTM_MAP_FLUSH_PENDING, pmap_enter() will drop the PVH
5367 		 * lock before turning around to retry the mapping operation.  It may then be possible for the
5368 		 * mapping state of the page to change such that our next attempt to map it will fail with a TXM
5369 		 * error, so if we were to leave the new type in place here we would then have lost our record
5370 		 * of the previous type and would effectively leave the page in an inconsistent state.
5371 		 */
5372 		if (__improbable(new_frame_type != prev_frame_type)) {
5373 			sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
5374 			sptm_retype(pa, new_frame_type, prev_frame_type, retype_params);
5375 		}
5376 		return sptm_status;
5377 	}
5378 
5379 	*old_pte = prev_pte = PERCPU_GET(pmap_sptm_percpu)->sptm_prev_ptes[0];
5380 
5381 	if (prev_pte != new_pte) {
5382 		changed_wiring = pte_is_compressed(prev_pte, pte_p) ?
5383 		    (new_pte & ARM_PTE_WIRED) != 0 :
5384 		    (new_pte & ARM_PTE_WIRED) != (prev_pte & ARM_PTE_WIRED);
5385 
5386 		if ((pmap != kernel_pmap) && changed_wiring) {
5387 			pte_update_wiredcnt(pmap, pte_p, (new_pte & ARM_PTE_WIRED) != 0);
5388 		}
5389 
5390 		PMAP_TRACE(4 + pt_attr_leaf_level(pmap_get_pt_attr(pmap)), PMAP_CODE(PMAP__TTE),
5391 		    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v),
5392 		    VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)), new_pte);
5393 	}
5394 
5395 	return sptm_status;
5396 }
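/*
 * Sketch of the expected caller pattern (mirroring pmap_enter_options_internal()
 * below); the essential contract is that SPTM_MAP_FLUSH_PENDING means "retry the
 * whole transaction", while any other non-success status is terminal:
 *
 *	sptm_return_t ret = pmap_enter_pte(pmap, pte_p, new_pte, &locked_pvh, &spte, v, options, type);
 *	if ((ret == SPTM_SUCCESS) || (ret == SPTM_MAP_VALID)) {
 *		// committed: update PV lists, page attributes, and ledgers
 *	} else if (ret == SPTM_MAP_FLUSH_PENDING) {
 *		// drop the PVH lock and retry the transaction
 *	} else {
 *		// e.g. SPTM_MAP_CODESIGN_ERROR: fail the enter with an appropriate kern_return_t
 *	}
 */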
5397 
5398 MARK_AS_PMAP_TEXT static pt_entry_t
5399 wimg_to_pte(unsigned int wimg, pmap_paddr_t pa)
5400 {
5401 	pt_entry_t pte;
5402 
5403 	switch (wimg & (VM_WIMG_MASK)) {
5404 	case VM_WIMG_IO:
5405 		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5406 		// Device-nGnRnE. On H14+, accesses to them can be reordered by
5407 		// AP, while preserving the security benefits of using device
5408 		// mapping against side-channel attacks. On pre-H14 platforms,
5409 		// the accesses will still be strongly ordered.
5410 		if (is_dram_addr(pa)) {
5411 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5412 		} else {
5413 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5414 #if HAS_FEAT_XS
5415 			pmap_io_range_t *io_rgn = pmap_find_io_attr(pa);
5416 			if (__improbable((io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_STRONG_SYNC))) {
5417 				pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE_XS);
5418 			}
5419 #endif /* HAS_FEAT_XS */
5420 		}
5421 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5422 		break;
5423 	case VM_WIMG_RT:
5424 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
5425 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5426 		break;
5427 	case VM_WIMG_POSTED:
5428 		if (is_dram_addr(pa)) {
5429 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5430 		} else {
5431 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5432 		}
5433 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5434 		break;
5435 	case VM_WIMG_POSTED_REORDERED:
5436 		if (is_dram_addr(pa)) {
5437 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5438 		} else {
5439 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5440 		}
5441 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5442 		break;
5443 	case VM_WIMG_POSTED_COMBINED_REORDERED:
5444 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5445 #if HAS_FEAT_XS
5446 		if (!is_dram_addr(pa)) {
5447 			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5448 		}
5449 #endif /* HAS_FEAT_XS */
5450 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5451 		break;
5452 	case VM_WIMG_WCOMB:
5453 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5454 		pte |= ARM_PTE_NX | ARM_PTE_PNX;
5455 		break;
5456 	case VM_WIMG_WTHRU:
5457 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5458 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5459 		break;
5460 	case VM_WIMG_COPYBACK:
5461 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5462 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5463 		break;
5464 	case VM_WIMG_INNERWBACK:
5465 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5466 		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5467 		break;
5468 	default:
5469 		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5470 		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5471 	}
5472 
5473 	return pte;
5474 }
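/*
 * For example, wimg_to_pte(VM_WIMG_IO, pa) yields the posted/combined/reordered
 * (Device-GRE) attribute index when pa is a DRAM address, and the strongly-ordered
 * Device-nGnRnE index (CACHE_ATTRINDX_DISABLE, or its XS variant on FEAT_XS parts
 * with strong-sync I/O ranges) otherwise; in all of these cases the result also
 * carries ARM_PTE_NX | ARM_PTE_PNX so device mappings are never executable.
 */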
5475 
5476 
5477 /*
5478  * Construct a PTE (and the physical page attributes) for the given virtual to
5479  * physical mapping.
5480  *
5481  * This function has no side effects, so it is safe to call while attempting a
5482  * pmap_enter transaction.
5483  */
5484 MARK_AS_PMAP_TEXT static pt_entry_t
5485 pmap_construct_pte(
5486 	const pmap_t pmap,
5487 	vm_map_address_t va,
5488 	pmap_paddr_t pa,
5489 	vm_prot_t prot,
5490 	vm_prot_t fault_type,
5491 	boolean_t wired,
5492 	const pt_attr_t* const pt_attr,
5493 	uint16_t *pp_attr_bits /* OUTPUT */
5494 	)
5495 {
5496 	bool set_NX = false, set_XO = false;
5497 	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE_VALID;
5498 	assert(pp_attr_bits != NULL);
5499 	*pp_attr_bits = 0;
5500 
5501 	if (wired) {
5502 		pte |= ARM_PTE_WIRED;
5503 	}
5504 
5505 #if DEVELOPMENT || DEBUG
5506 	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5507 #else
5508 	if ((prot & VM_PROT_EXECUTE))
5509 #endif
5510 	{
5511 		set_NX = false;
5512 	} else {
5513 		set_NX = true;
5514 	}
5515 
5516 	if (prot == VM_PROT_EXECUTE) {
5517 		set_XO = true;
5518 
5519 	}
5520 
5521 	if (set_NX) {
5522 		pte |= pt_attr_leaf_xn(pt_attr);
5523 	} else {
5524 		if (pmap == kernel_pmap) {
5525 			pte |= ARM_PTE_NX;
5526 		} else {
5527 			pte |= pt_attr_leaf_x(pt_attr);
5528 		}
5529 	}
5530 
5531 	if (pmap == kernel_pmap) {
5532 #if __ARM_KERNEL_PROTECT__
5533 		pte |= ARM_PTE_NG;
5534 #endif /* __ARM_KERNEL_PROTECT__ */
5535 		if (prot & VM_PROT_WRITE) {
5536 			pte |= ARM_PTE_AP(AP_RWNA);
5537 			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5538 		} else {
5539 			pte |= ARM_PTE_AP(AP_RONA);
5540 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5541 		}
5542 	} else {
5543 		if (pmap->type != PMAP_TYPE_NESTED) {
5544 			pte |= ARM_PTE_NG;
5545 		} else if ((pmap->nested_region_unnested_table_bitmap)
5546 		    && (va >= pmap->nested_region_addr)
5547 		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5548 			unsigned int index = (unsigned int)((va - pmap->nested_region_addr)  >> pt_attr_twig_shift(pt_attr));
5549 
5550 			if ((pmap->nested_region_unnested_table_bitmap)
5551 			    && bitmap_test(pmap->nested_region_unnested_table_bitmap, index)) {
5552 				pte |= ARM_PTE_NG;
5553 			}
5554 		}
5555 		if (prot & VM_PROT_WRITE) {
5556 			assert(pmap->type != PMAP_TYPE_NESTED);
5557 			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5558 				if (fault_type & VM_PROT_WRITE) {
5559 					pte |= pt_attr_leaf_rw(pt_attr);
5560 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5561 				} else {
5562 					pte |= pt_attr_leaf_ro(pt_attr);
5563 					/*
5564 					 * Mark the page as MODFAULT so that a subsequent write
5565 					 * may be handled through arm_fast_fault().
5566 					 */
5567 					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5568 					pte_set_was_writeable(pte, true);
5569 				}
5570 			} else {
5571 				pte |= pt_attr_leaf_rw(pt_attr);
5572 				*pp_attr_bits |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
5573 			}
5574 		} else {
5575 			if (set_XO) {
5576 				pte |= pt_attr_leaf_rona(pt_attr);
5577 			} else {
5578 				pte |= pt_attr_leaf_ro(pt_attr);
5579 			}
5580 			*pp_attr_bits |= PP_ATTR_REFERENCED;
5581 		}
5582 	}
5583 
5584 	pte |= ARM_PTE_AF;
5585 	return pte;
5586 }
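/*
 * As a rough illustration (not an exhaustive enumeration of the cases above), a wired,
 * writable, non-executable user-space mapping constructed here for a write fault (or
 * for a page already marked modified) ends up looking like:
 *
 *	pa_to_pte(pa) | ARM_PTE_TYPE_VALID | ARM_PTE_WIRED | ARM_PTE_NG |
 *	    pt_attr_leaf_xn(pt_attr) | pt_attr_leaf_rw(pt_attr) | ARM_PTE_AF
 *
 * with PP_ATTR_REFERENCED | PP_ATTR_MODIFIED reported back through *pp_attr_bits.
 */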
5587 
5588 MARK_AS_PMAP_TEXT kern_return_t
5589 pmap_enter_options_internal(
5590 	pmap_t pmap,
5591 	vm_map_address_t v,
5592 	pmap_paddr_t pa,
5593 	vm_prot_t prot,
5594 	vm_prot_t fault_type,
5595 	unsigned int flags,
5596 	boolean_t wired,
5597 	unsigned int options,
5598 	pmap_mapping_type_t mapping_type)
5599 {
5600 	ppnum_t         pn = (ppnum_t)atop(pa);
5601 	pt_entry_t      *pte_p;
5602 	unsigned int    wimg_bits;
5603 	bool            committed = false;
5604 	kern_return_t   kr = KERN_SUCCESS;
5605 	uint16_t pp_attr_bits;
5606 	volatile uint16_t *wiredcnt = NULL;
5607 	pv_free_list_t *local_pv_free;
5608 
5609 	validate_pmap_mutable(pmap);
5610 
5611 	/**
5612 	 * Prepare for the SPTM call early by prefetching the relevant FTEs. Cache misses
5613 	 * in SPTM accessing these turn out to contribute to a large portion of delay on
5614 	 * the critical path. Technically, sptm_prefetch_fte may not find an FTE associated
5615 	 * with pa and return LIBSPTM_FAILURE. However, we are okay with that as it's only
5616 	 * a best-effort performance optimization.
5617 	 */
5618 	sptm_prefetch_fte(pmap->ttep);
5619 	sptm_prefetch_fte(pa);
5620 
5621 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5622 
5623 	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
5624 		panic("pmap_enter_options() pmap %p v 0x%llx",
5625 		    pmap, (uint64_t)v);
5626 	}
5627 
5628 	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
5629 		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
5630 		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
5631 	}
5632 
5633 	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
5634 		panic("pmap_enter_options() pmap %p pa 0x%llx",
5635 		    pmap, (uint64_t)pa);
5636 	}
5637 
5638 	/* The PA should not extend beyond the architected physical address space */
5639 	pa &= ARM_PTE_PAGE_MASK;
5640 
5641 	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
5642 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
5643 		extern vm_offset_t ctrr_test_page;
5644 		if (__probable(v != ctrr_test_page))
5645 #endif
5646 		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
5647 	}
5648 	assert(pn != vm_page_fictitious_addr);
5649 
5650 	pmap_lock(pmap, PMAP_LOCK_SHARED);
5651 
5652 	/*
5653 	 *	Expand pmap to include this pte.  Assume that
5654 	 *	pmap is always expanded to include enough hardware
5655 	 *	pages to map one VM page.
5656 	 */
5657 	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
5658 		/* Must unlock to expand the pmap. */
5659 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
5660 
5661 		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));
5662 
5663 		if (kr != KERN_SUCCESS) {
5664 			return kr;
5665 		}
5666 
5667 		pmap_lock(pmap, PMAP_LOCK_SHARED);
5668 	}
5669 
5670 	if (options & PMAP_OPTIONS_NOENTER) {
5671 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
5672 		return KERN_SUCCESS;
5673 	}
5674 
5675 	/*
5676 	 * Since we may not hold the pmap lock exclusive, updating the pte is
5677 	 * done via a cmpxchg loop.
5678 	 * We need to be careful about modifying non-local data structures before committing
5679 	 * the new pte since we may need to re-do the transaction.
5680 	 */
5681 	const pt_entry_t prev_pte = os_atomic_load(pte_p, relaxed);
5682 
5683 	if (pte_is_valid(prev_pte) && (pte_to_pa(prev_pte) != pa)) {
5684 		/*
5685 		 * There is already a mapping here & it's for a different physical page.
5686 		 * First remove that mapping.
5687 		 * We assume that we can leave the pmap lock held for shared access rather
5688 		 * than exclusive access here, because we assume that the VM won't try to
5689 		 * simultaneously map the same VA to multiple different physical pages.
5690 		 * If that assumption is violated, sptm_map_page() will panic as the architecture
5691 		 * does not allow the output address of a mapping to be changed without a break-
5692 		 * before-make sequence.
5693 		 */
5694 		pmap_remove_range(pmap, v, v + PAGE_SIZE);
5695 	}
5696 
5697 	if (pmap != kernel_pmap) {
5698 		ptd_info_t *ptd_info = ptep_get_info(pte_p);
5699 		wiredcnt = &ptd_info->wiredcnt;
5700 	}
5701 
5702 	while (!committed) {
5703 		pt_entry_t spte = ARM_PTE_EMPTY;
5704 		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
5705 		bool skip_footprint_debit = false;
5706 
5707 		/*
5708 		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
5709 		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
5710 		 * read-write protection. The PMAP layer though still needs to use the right
5711 		 * index, which is the older XO-now-TPRO one and that is specially selected
5712 		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
5713 		 *
5714 		 * Note that pmap_construct_pte() may check the nested region ASID bitmap,
5715 		 * which needs to happen at every iteration of the commit loop in case we
5716 		 * previously dropped the pmap lock.
5717 		 */
5718 		pt_entry_t pte = pmap_construct_pte(pmap, v, pa,
5719 		    ((options & PMAP_OPTIONS_MAP_TPRO) ? VM_PROT_RORW_TP : prot), fault_type, wired, pt_attr, &pp_attr_bits);
5720 
5721 		if (pa_valid(pa)) {
5722 			unsigned int pai;
5723 			boolean_t   is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;
5724 
5725 			is_internal = FALSE;
5726 			is_altacct = FALSE;
5727 
5728 			pai = pa_index(pa);
5729 			locked_pvh_t locked_pvh;
5730 
5731 			if (__improbable(options & PMAP_OPTIONS_NOPREEMPT)) {
5732 				locked_pvh = pvh_lock_nopreempt(pai);
5733 			} else {
5734 				locked_pvh = pvh_lock(pai);
5735 			}
5736 
5737 			/*
5738 			 * Make sure that the current per-cpu PV free list has
5739 			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
5740 			 * if the transaction succeeds. At this point, preemption has either
5741 			 * been disabled by the caller or by pvh_lock() above.
5742 			 * Note that we can still be interrupted, but a primary
5743 			 * interrupt handler can never enter the pmap.
5744 			 */
5745 			assert(get_preemption_level() > 0);
5746 			local_pv_free = &pmap_get_cpu_data()->pv_free;
5747 			const bool allocation_required = !pvh_test_type(locked_pvh.pvh, PVH_TYPE_NULL) &&
5748 			    !(pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP) && pvh_ptep(locked_pvh.pvh) == pte_p);
5749 
5750 			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
5751 				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
5752 				int new_allocated_pves = 0;
5753 
5754 				while (new_allocated_pves < 2) {
5755 					local_pv_free = &pmap_get_cpu_data()->pv_free;
5756 					pv_status = pv_alloc(pmap, PMAP_LOCK_SHARED, options, &new_pve_p[new_allocated_pves], &locked_pvh, wiredcnt);
5757 					if (pv_status == PV_ALLOC_FAIL) {
5758 						break;
5759 					} else if (pv_status == PV_ALLOC_RETRY) {
5760 						/*
5761 						 * In the case that pv_alloc() had to grab a new page of PVEs,
5762 						 * it will have dropped the pmap lock while doing so.
5763 						 * On non-PPL devices, dropping the lock re-enables preemption so we may
5764 						 * be on a different CPU now.
5765 						 */
5766 						local_pv_free = &pmap_get_cpu_data()->pv_free;
5767 					} else {
5768 						/* If we've gotten this far then a node should've been allocated. */
5769 						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);
5770 
5771 						new_allocated_pves++;
5772 					}
5773 				}
5774 
5775 				for (int i = 0; i < new_allocated_pves; i++) {
5776 					pv_free(new_pve_p[i]);
5777 				}
5778 			}
5779 
5780 			if (pv_status == PV_ALLOC_FAIL) {
5781 				pvh_unlock(&locked_pvh);
5782 				kr = KERN_RESOURCE_SHORTAGE;
5783 				break;
5784 			} else if (pv_status == PV_ALLOC_RETRY) {
5785 				pvh_unlock(&locked_pvh);
5786 				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
5787 				continue;
5788 			}
5789 
5790 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
5791 				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
5792 			} else {
5793 				wimg_bits = pmap_cache_attributes(pn);
5794 			}
5795 
5796 			/**
5797 			 * We may be retrying this operation after dropping the PVH lock.
5798 			 * Cache attributes for the physical page may have changed while the lock
5799 			 * was dropped, so update PTE cache attributes on each loop iteration.
5800 			 */
5801 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
5802 
5803 
5804 			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, &locked_pvh, &spte, v, options, mapping_type);
5805 			assert(committed == false);
5806 			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
5807 				committed = true;
5808 			} else if (sptm_status == SPTM_MAP_FLUSH_PENDING) {
5809 				pvh_unlock(&locked_pvh);
5810 				continue;
5811 			} else if (sptm_status == SPTM_MAP_CODESIGN_ERROR) {
5812 				pvh_unlock(&locked_pvh);
5813 				kr = KERN_CODESIGN_ERROR;
5814 				break;
5815 			} else {
5816 				pvh_unlock(&locked_pvh);
5817 				kr = KERN_FAILURE;
5818 				break;
5819 			}
5820 			const bool had_valid_mapping = (sptm_status == SPTM_MAP_VALID);
5821 			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */
5822 			if (!had_valid_mapping) {
5823 				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
5824 				int pve_ptep_idx = 0;
5825 				pv_status = pmap_enter_pv(pmap, pte_p, options, PMAP_LOCK_SHARED, &locked_pvh, &new_pve_p, &pve_ptep_idx);
5826 				/* We did all the allocations up top. So this shouldn't be able to fail. */
5827 				if (pv_status != PV_ALLOC_SUCCESS) {
5828 					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
5829 					    __func__, pv_status, new_pve_p, pmap);
5830 				}
5831 
5832 				if (pmap != kernel_pmap) {
5833 					if (options & PMAP_OPTIONS_INTERNAL) {
5834 						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
5835 						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
5836 						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
5837 							/*
5838 							 * Make a note to ourselves that this
5839 							 * mapping is using alternative
5840 							 * accounting. We'll need this in order
5841 							 * to know which ledger to debit when
5842 							 * the mapping is removed.
5843 							 *
5844 							 * The altacct bit must be set while
5845 							 * the pv head is locked. Defer the
5846 							 * ledger accounting until after we've
5847 							 * dropped the lock.
5848 							 */
5849 							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
5850 							is_altacct = TRUE;
5851 						}
5852 					}
5853 					if (ppattr_test_reusable(pai) &&
5854 					    !is_altacct) {
5855 						is_reusable = TRUE;
5856 					} else if (options & PMAP_OPTIONS_INTERNAL) {
5857 						is_internal = TRUE;
5858 					} else {
5859 						is_external = TRUE;
5860 					}
5861 				}
5862 			}
5863 
5864 			pvh_unlock(&locked_pvh);
5865 
5866 			if (pp_attr_bits != 0) {
5867 				ppattr_pa_set_bits(pa, pp_attr_bits);
5868 			}
5869 
5870 			if (!had_valid_mapping && (pmap != kernel_pmap)) {
5871 				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5872 
5873 				if (is_internal) {
5874 					/*
5875 					 * Make corresponding adjustments to
5876 					 * phys_footprint statistics.
5877 					 */
5878 					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5879 					if (is_altacct) {
5880 						/*
5881 						 * If this page is internal and
5882 						 * in an IOKit region, credit
5883 						 * the task's total count of
5884 						 * dirty, internal IOKit pages.
5885 						 * It should *not* count towards
5886 						 * the task's total physical
5887 						 * memory footprint, because
5888 						 * this entire region was
5889 						 * already billed to the task
5890 						 * at the time the mapping was
5891 						 * created.
5892 						 *
5893 						 * Put another way, this is
5894 						 * internal++ and
5895 						 * alternate_accounting++, so
5896 						 * net effect on phys_footprint
5897 						 * is 0. That means: don't
5898 						 * touch phys_footprint here.
5899 						 */
5900 						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5901 					} else {
5902 						if (pte_is_compressed(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
5903 							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
5904 							skip_footprint_debit = true;
5905 						} else {
5906 							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5907 						}
5908 					}
5909 				}
5910 				if (is_reusable) {
5911 					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5912 				} else if (is_external) {
5913 					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5914 				}
5915 			}
5916 		} else {
5917 			if (prot & VM_PROT_EXECUTE) {
5918 				kr = KERN_FAILURE;
5919 				break;
5920 			}
5921 
5922 			wimg_bits = pmap_cache_attributes(pn);
5923 			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
5924 				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
5925 			}
5926 
5927 			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);
5928 
5929 
5930 			/**
5931 			 * pmap_enter_pte() expects to be called with preemption disabled so it can access
5932 			 * the per-CPU prev_ptes array.
5933 			 */
5934 			disable_preemption();
5935 			const sptm_return_t sptm_status = pmap_enter_pte(pmap, pte_p, pte, NULL, &spte, v, options, mapping_type);
5936 			enable_preemption();
5937 			assert(committed == false);
5938 			if ((sptm_status == SPTM_SUCCESS) || (sptm_status == SPTM_MAP_VALID)) {
5939 				committed = true;
5940 
5941 				/**
5942 				 * If there was already a valid pte here then we reuse its
5943 				 * reference on the ptd and drop the one that we took above.
5944 				 */
5945 			} else if (__improbable(sptm_status != SPTM_MAP_FLUSH_PENDING)) {
5946 				panic("%s: Unexpected SPTM return code %u for non-managed PA 0x%llx", __func__, (unsigned int)sptm_status, (unsigned long long)pa);
5947 			}
5948 		}
5949 		if (committed) {
5950 			if (pte_is_compressed(spte, pte_p)) {
5951 				assert(pmap != kernel_pmap);
5952 
5953 				/* One less "compressed" */
5954 				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
5955 				    pt_attr_page_size(pt_attr) * PAGE_RATIO);
5956 
5957 				if (spte & ARM_PTE_COMPRESSED_ALT) {
5958 					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5959 				} else if (!skip_footprint_debit) {
5960 					/* Was part of the footprint */
5961 					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5962 				}
5963 			}
5964 		}
5965 	}
5966 
5967 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
5968 
5969 	if (kr == KERN_CODESIGN_ERROR) {
5970 		/* Print any logs from TXM */
5971 		txm_print_logs();
5972 	}
5973 	return kr;
5974 }
5975 
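/*
 *	Routine:	pmap_enter_options_addr
 *	Function:	Exported wrapper around pmap_enter_options_internal() that takes a
 *			physical address rather than a page number, emitting PMAP_TRACE
 *			start/end events around the call.
 */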
5976 kern_return_t
5977 pmap_enter_options_addr(
5978 	pmap_t pmap,
5979 	vm_map_address_t v,
5980 	pmap_paddr_t pa,
5981 	vm_prot_t prot,
5982 	vm_prot_t fault_type,
5983 	unsigned int flags,
5984 	boolean_t wired,
5985 	unsigned int options,
5986 	__unused void   *arg,
5987 	pmap_mapping_type_t mapping_type)
5988 {
5989 	kern_return_t kr = KERN_FAILURE;
5990 
5991 
5992 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
5993 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
5994 
5995 	kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options, mapping_type);
5996 
5997 	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
5998 
5999 	return kr;
6000 }
6001 
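/*
 *	Routine:	pmap_enter_options
 *	Function:	Convert the given page number to a physical address and forward
 *			the request to pmap_enter_options_addr().
 */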
6002 kern_return_t
6003 pmap_enter_options(
6004 	pmap_t pmap,
6005 	vm_map_address_t v,
6006 	ppnum_t pn,
6007 	vm_prot_t prot,
6008 	vm_prot_t fault_type,
6009 	unsigned int flags,
6010 	boolean_t wired,
6011 	unsigned int options,
6012 	__unused void   *arg,
6013 	pmap_mapping_type_t mapping_type)
6014 {
6015 	return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot,
6016 	           fault_type, flags, wired, options, arg, mapping_type);
6017 }
6018 
6019 /*
6020  *	Routine:	pmap_change_wiring
6021  *	Function:	Change the wiring attribute for a map/virtual-address
6022  *			pair.
6023  *	In/out conditions:
6024  *			The mapping must already exist in the pmap.
6025  */
6026 MARK_AS_PMAP_TEXT void
6027 pmap_change_wiring_internal(
6028 	pmap_t pmap,
6029 	vm_map_address_t v,
6030 	boolean_t wired)
6031 {
6032 	pt_entry_t     *pte_p, prev_pte;
6033 
6034 	validate_pmap_mutable(pmap);
6035 
6036 	pmap_lock(pmap, PMAP_LOCK_SHARED);
6037 
6038 	const pt_entry_t new_wiring = (wired ? ARM_PTE_WIRED : 0);
6039 
6040 	pte_p = pmap_pte(pmap, v);
6041 	if (pte_p == PT_ENTRY_NULL) {
6042 		if (!wired) {
6043 			/*
6044 			 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6045 			 * may have been freed by a remove operation.
6046 			 */
6047 			goto pmap_change_wiring_return;
6048 		} else {
6049 			panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6050 		}
6051 	}
6052 
6053 	disable_preemption();
6054 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
6055 	sptm_pcpu->sptm_templates[0] = (*pte_p & ~ARM_PTE_WIRED) | new_wiring;
6056 
6057 	pmap_retype_epoch_enter();
6058 	sptm_update_region(pmap->ttep, v, 1, sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_SW_WIRED);
6059 	pmap_retype_epoch_exit();
6060 
6061 	prev_pte = os_atomic_load(&sptm_pcpu->sptm_prev_ptes[0], relaxed);
6062 	enable_preemption();
6063 
6064 	if (!pte_is_valid(prev_pte)) {
6065 		goto pmap_change_wiring_return;
6066 	}
6067 
6068 	if ((pmap != kernel_pmap) && (wired != pte_is_wired(prev_pte))) {
6069 		pte_update_wiredcnt(pmap, pte_p, wired);
6070 	}
6071 
6072 pmap_change_wiring_return:
6073 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
6074 }
6075 
6076 void
6077 pmap_change_wiring(
6078 	pmap_t pmap,
6079 	vm_map_address_t v,
6080 	boolean_t wired)
6081 {
6082 	pmap_change_wiring_internal(pmap, v, wired);
6083 }
6084 
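/*
 *	Routine:	pmap_find_pa_internal
 *	Function:	Translate a virtual address to a physical address by walking the
 *			page tables in software.  For user pmaps, the pmap lock is taken
 *			in shared mode to keep the tables from being removed during the walk.
 */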
6085 MARK_AS_PMAP_TEXT pmap_paddr_t
6086 pmap_find_pa_internal(
6087 	pmap_t pmap,
6088 	addr64_t va)
6089 {
6090 	pmap_paddr_t    pa = 0;
6091 
6092 	validate_pmap(pmap);
6093 
6094 	if (pmap != kernel_pmap) {
6095 		pmap_lock(pmap, PMAP_LOCK_SHARED);
6096 	}
6097 
6098 	pa = pmap_vtophys(pmap, va);
6099 
6100 	if (pmap != kernel_pmap) {
6101 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
6102 	}
6103 
6104 	return pa;
6105 }
6106 
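/*
 *	Routine:	pmap_find_pa_nofault
 *	Function:	Attempt a lock-free VA->PA translation using the MMU translation
 *			helpers.  Only the kernel pmap and the current thread's pmap are
 *			supported; returns 0 for any other pmap or if no valid mapping exists.
 */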
6107 pmap_paddr_t
6108 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6109 {
6110 	pmap_paddr_t pa = 0;
6111 
6112 	if (pmap == kernel_pmap) {
6113 		pa = mmu_kvtop(va);
6114 	} else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6115 		/*
6116 		 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6117 		 * translation even if PAN would prevent kernel access through the translation.
6118 		 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6119 		 */
6120 		pa = mmu_uvtop(va);
6121 	}
6122 	return pa;
6123 }
6124 
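/*
 *	Routine:	pmap_find_pa
 *	Function:	Translate a virtual address to a physical address, first trying the
 *			lock-free fast path and falling back to a locked software table walk
 *			(or an unlocked walk when running in the kernel debugger context).
 */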
6125 pmap_paddr_t
6126 pmap_find_pa(
6127 	pmap_t pmap,
6128 	addr64_t va)
6129 {
6130 	pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6131 
6132 	if (pa != 0) {
6133 		return pa;
6134 	}
6135 
6136 	if (not_in_kdp) {
6137 		return pmap_find_pa_internal(pmap, va);
6138 	} else {
6139 		return pmap_vtophys(pmap, va);
6140 	}
6141 }
6142 
6143 ppnum_t
6144 pmap_find_phys_nofault(
6145 	pmap_t pmap,
6146 	addr64_t va)
6147 {
6148 	ppnum_t ppn;
6149 	ppn = atop(pmap_find_pa_nofault(pmap, va));
6150 	return ppn;
6151 }
6152 
6153 ppnum_t
6154 pmap_find_phys(
6155 	pmap_t pmap,
6156 	addr64_t va)
6157 {
6158 	ppnum_t ppn;
6159 	ppn = atop(pmap_find_pa(pmap, va));
6160 	return ppn;
6161 }
6162 
6163 /**
6164  * Translate a kernel virtual address into a physical address.
6165  *
6166  * @param va The kernel virtual address to translate. Does not work on user
6167  *           virtual addresses.
6168  *
6169  * @return The physical address if the translation was successful, or zero if
6170  *         no valid mappings were found for the given virtual address.
6171  */
6172 pmap_paddr_t
6173 kvtophys(vm_offset_t va)
6174 {
6175 	sptm_paddr_t pa;
6176 
6177 	if (sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS) {
6178 		return 0;
6179 	}
6180 
6181 	return pa;
6182 }
6183 
6184 /**
6185  * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6186  * points to a non-kernel-managed physical page, then this call will panic().
6187  *
6188  * @note The output of this function is guaranteed to be a kernel-managed
6189  *       physical page, which means it's safe to pass the output directly to
6190  *       pa_index() to create a physical address index for various pmap data
6191  *       structures.
6192  *
6193  * @param va The kernel virtual address to translate. Does not work on user
6194  *           virtual addresses.
6195  *
6196  * @return The translated physical address for the given virtual address.
6197  */
6198 pmap_paddr_t
6199 kvtophys_nofail(vm_offset_t va)
6200 {
6201 	pmap_paddr_t pa;
6202 
6203 	if (__improbable(sptm_kvtophys(va, &pa) != LIBSPTM_SUCCESS)) {
6204 		panic("%s: VA->PA translation failed for va %p", __func__, (void *)va);
6205 	}
6206 
6207 	return pa;
6208 }
6209 
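/*
 *	Routine:	pmap_vtophys
 *	Function:	Perform a software walk of the pmap's page table hierarchy for the
 *			given virtual address.  Returns the corresponding physical address if
 *			a valid leaf or block mapping is found, or 0 otherwise.
 */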
6210 pmap_paddr_t
6211 pmap_vtophys(
6212 	pmap_t pmap,
6213 	addr64_t va)
6214 {
6215 	if ((va < pmap->min) || (va >= pmap->max)) {
6216 		return 0;
6217 	}
6218 
6219 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6220 
6221 	tt_entry_t * ttp = NULL;
6222 	tt_entry_t * ttep = NULL;
6223 	tt_entry_t   tte = ARM_TTE_EMPTY;
6224 	pmap_paddr_t pa = 0;
6225 	unsigned int cur_level;
6226 
6227 	ttp = pmap->tte;
6228 
6229 	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6230 		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6231 
6232 		tte = *ttep;
6233 
6234 		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6235 		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6236 		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6237 		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6238 
6239 		if ((tte & valid_mask) != valid_mask) {
6240 			return (pmap_paddr_t) 0;
6241 		}
6242 
6243 		/* This detects both leaf entries and intermediate block mappings. */
6244 		if ((tte & type_mask) == type_block) {
6245 			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6246 			break;
6247 		}
6248 
6249 		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6250 	}
6251 
6252 	return pa;
6253 }
6254 
6255 /*
6256  *	pmap_init_pte_page - Initialize a page table page.
6257  */
6258 MARK_AS_PMAP_TEXT void
6259 pmap_init_pte_page(
6260 	pmap_t pmap,
6261 	pt_entry_t *pte_p,
6262 	vm_offset_t va,
6263 	unsigned int ttlevel,
6264 	boolean_t alloc_ptd)
6265 {
6266 	pt_desc_t   *ptdp = NULL;
6267 	unsigned int pai = pa_index(kvtophys_nofail((vm_offset_t)pte_p));
6268 	const uintptr_t pvh = pai_to_pvh(pai);
6269 
6270 	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6271 		if (alloc_ptd) {
6272 			/*
6273 			 * This path should only be invoked from arm_vm_init.  If we are emulating 16KB pages
6274 			 * on 4KB hardware, we may already have allocated a page table descriptor for a
6275 			 * bootstrap request, so we check for an existing PTD here.
6276 			 */
6277 			ptdp = ptd_alloc(pmap, PMAP_PAGE_ALLOCATE_NOWAIT);
6278 			if (ptdp == NULL) {
6279 				panic("%s: unable to allocate PTD", __func__);
6280 			}
6281 			locked_pvh_t locked_pvh = pvh_lock(pai);
6282 			pvh_update_head(&locked_pvh, ptdp, PVH_TYPE_PTDP);
6283 			pvh_unlock(&locked_pvh);
6284 		} else {
6285 			panic("pmap_init_pte_page(): no PTD for pte_p %p", pte_p);
6286 		}
6287 	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6288 		ptdp = pvh_ptd(pvh);
6289 	} else {
6290 		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6291 	}
6292 
6293 	// pagetable zero-fill and barrier should be guaranteed by the SPTM
6294 	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6295 }
6296 
6297 /*
6298  * This function guarantees that a pmap has the necessary page tables in place
6299  * to map the specified VA.  If necessary, it will allocate new tables at any
6300  * non-root level in the hierarchy (the root table is always already allocated
6301  * and stored in the pmap).
6302  *
6303  * @note This function is expected to be called without any pmap or PVH lock
6304  *       held.
6305  *
6306  * @note It is possible for an L3 table newly allocated by this function to be
6307  *       deleted by another thread before control returns to the caller, iff that
6308  *       table is an ordinary userspace table.  Callers that use this function
6309  *       to allocate new user L3 tables are therefore expected to keep calling
6310  *       this function until they observe a successful L3 PTE lookup with the pmap
6311  *       lock held.  As long as it does not drop the pmap lock, the caller may
6312  *       then safely use the looked-up L3 table.  See the use of this function in
6313  *       pmap_enter_options_internal() for an example.
6314  *
6315  * @param pmap The pmap for which to ensure mapping space is present.
6316  * @param vaddr The virtual address for which to ensure mapping space is present
6317  *              in [pmap].
6318  * @param options Flags to pass to pmap_tt_allocate() if a new table needs to be
6319  *                allocated.  The only valid option is PMAP_OPTIONS_NOWAIT, which
6320  *                specifies that the allocation must not block.
6321  * @param level The maximum paging level for which to ensure a table is present.
6322  *
6323  * @return KERN_INVALID_ADDRESS if [vaddr] is outside the pmap's mappable range,
6324  *         KERN_RESOURCE_SHORTAGE if a new table can't be allocated,
6325  *         KERN_SUCCESS otherwise.
6326  */
6327 MARK_AS_PMAP_TEXT static kern_return_t
6328 pmap_expand(
6329 	pmap_t pmap,
6330 	vm_map_address_t vaddr,
6331 	unsigned int options,
6332 	unsigned int level)
6333 {
6334 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6335 
6336 	if (__improbable((vaddr < pmap->min) || (vaddr >= pmap->max))) {
6337 		return KERN_INVALID_ADDRESS;
6338 	}
6339 	pmap_paddr_t pa;
6340 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6341 	const uint64_t table_align_mask = (PAGE_SIZE / pmap_page_size) - 1;
6342 	unsigned int ttlevel = pt_attr_root_level(pt_attr);
6343 	tt_entry_t *table_ttep = pmap->tte;
6344 	tt_entry_t *ttep;
6345 	tt_entry_t old_tte = ARM_TTE_EMPTY;
6346 
6347 	pa = 0x0ULL;
6348 
6349 	for (; ttlevel < level; ttlevel++) {
6350 		/**
6351 		 * If the previous iteration didn't allocate a new table, obtain the table from the previous TTE.
6352 		 * Doing this step at the beginning of the loop instead of the end (which would make it part of
6353 		 * the prior iteration) avoids the possibility of executing this step to extract an L3 table KVA
6354 		 * from an L2 TTE, which would be useless because there would be no next iteration to make use
6355 		 * of the table KVA.
6356 		 */
6357 		if (table_ttep == NULL) {
6358 			assert(tte_is_valid_table(old_tte));
6359 			table_ttep = (tt_entry_t*)phystokv(old_tte & ARM_TTE_TABLE_MASK);
6360 		}
6361 
6362 		vm_map_address_t v = pt_attr_align_va(pt_attr, ttlevel, vaddr);
6363 
6364 		/**
6365 		 * We don't need to hold the pmap lock while walking the paging hierarchy.  Only L3 tables are
6366 		 * allowed to be dynamically removed, and only for regular user pmaps at that.  We may allocate
6367 		 * a new L3 table below, but we will only access L0-L2 tables, so there's no risk of a table
6368 		 * being deleted while we are using it for the next level(s) of lookup.
6369 		 */
6370 		ttep = &table_ttep[ttn_index(pt_attr, vaddr, ttlevel)];
6371 		old_tte = os_atomic_load(ttep, relaxed);
6372 		table_ttep = NULL;
6373 		if (!tte_is_valid_table(old_tte)) {
6374 			tt_entry_t new_tte, *new_ttep;
6375 			while (pmap_tt_allocate(pmap, &new_ttep, ttlevel + 1, options | PMAP_PAGE_NOZEROFILL) != KERN_SUCCESS) {
6376 				if (options & PMAP_OPTIONS_NOWAIT) {
6377 					return KERN_RESOURCE_SHORTAGE;
6378 				}
6379 				VM_PAGE_WAIT();
6380 			}
6381 			/* Grab the pmap lock to ensure we don't try to concurrently map different tables at the same TTE. */
6382 			pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
6383 			old_tte = os_atomic_load(ttep, relaxed);
6384 			if (!tte_is_valid_table(old_tte)) {
6385 				pmap_init_pte_page(pmap, (pt_entry_t *) new_ttep, v, ttlevel + 1, FALSE);
6386 				pa = kvtophys_nofail((vm_offset_t)new_ttep);
6387 				/*
6388 				 * If the table is going to map a kernel RO zone VA region, then we must
6389 				 * upgrade its SPTM type to XNU_PAGE_TABLE_ROZONE.  The SPTM's type system
6390 				 * requires the table to be transitioned through XNU_DEFAULT for refcount
6391 				 * enforcement, which is fine since this path is expected to execute only
6392 				 * once during boot.
6393 				 */
6394 				if (__improbable(ttlevel == pt_attr_twig_level(pt_attr)) &&
6395 				    (pmap == kernel_pmap) && zone_spans_ro_va(vaddr, vaddr + PAGE_SIZE)) {
6396 					sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
6397 					sptm_retype(pa, XNU_PAGE_TABLE, XNU_DEFAULT, retype_params);
6398 					retype_params.level = (sptm_pt_level_t)pt_attr_leaf_level(pt_attr);
6399 					sptm_retype(pa, XNU_DEFAULT, XNU_PAGE_TABLE_ROZONE, retype_params);
6400 				}
6401 				new_tte = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6402 				sptm_map_table(pmap->ttep, v, (sptm_pt_level_t)ttlevel, new_tte);
6403 				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6404 				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), new_tte);
6405 				/**
6406 				 * If we need to set up multiple TTEs mapping different parts of the same page
6407 				 * (e.g. because we're carving multiple 4K page tables out of a 16K native page),
6408 				 * determine which of the grouped TTEs is the one that we need to follow for the
6409 				 * next level of the table walk.
6410 				 */
6411 				table_ttep = new_ttep + ((((uintptr_t)ttep / sizeof(tt_entry_t)) & table_align_mask) *
6412 				    (pmap_page_size / sizeof(tt_entry_t)));
6413 				pa = 0x0ULL;
6414 				new_ttep = (tt_entry_t *)NULL;
6415 			}
6416 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6417 
6418 			if (new_ttep != (tt_entry_t *)NULL) {
6419 				pmap_tt_deallocate(pmap, new_ttep, ttlevel + 1);
6420 				new_ttep = (tt_entry_t *)NULL;
6421 			}
6422 		}
6423 	}
6424 
6425 	return KERN_SUCCESS;
6426 }
6427 
6428 /*
6429  *	Routine:	pmap_gc
6430  *	Function:
6431  *              Pmap garbage collection
6432  *		Called by the pageout daemon when pages are scarce.
6433  *
6434  */
6435 void
6436 pmap_gc(void)
6437 {
6438 	/*
6439 	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
6440 	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6441 	 * or may contain wired mappings.  However, it may make sense to scan the pmap VM
6442 	 * object here, and for each page consult the SPTM frame table and if necessary
6443 	 * the PTD in the PV head table.  If the frame table indicates the page is a leaf
6444 	 * page table page and the PTD indicates it has no wired mappings, we can call
6445 	 * pmap_remove() on the VA region mapped by the page and therein return the page
6446 	 * to the VM.
6447 	 */
6448 }
6449 
6450 /*
6451  *      By default, don't attempt pmap GC more frequently
6452  *      than once per minute.
6453  */
6454 
6455 void
6456 compute_pmap_gc_throttle(
6457 	void *arg __unused)
6458 {
6459 }
6460 
6461 /*
6462  * pmap_attribute_cache_sync(ppnum_t pp, vm_size_t size, ...)
6463  *
6464  * Invalidates all of the instruction cache on a physical page and
6465  * pushes any dirty data from the data cache for the same physical page.
6466  */
6467 
6468 kern_return_t
6469 pmap_attribute_cache_sync(
6470 	ppnum_t pp,
6471 	vm_size_t size,
6472 	__unused vm_machine_attribute_t attribute,
6473 	__unused vm_machine_attribute_val_t * value)
6474 {
6475 	if (size > PAGE_SIZE) {
6476 		panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
6477 	} else {
6478 		cache_sync_page(pp);
6479 	}
6480 
6481 	return KERN_SUCCESS;
6482 }
6483 
6484 /*
6485  * pmap_sync_page_data_phys(ppnum_t pp)
6486  *
6487  * Invalidates all of the instruction cache on a physical page and
6488  * pushes any dirty data from the data cache for the same physical page.
6489  * Not required on SPTM systems, because the SPTM automatically performs
6490  * the invalidate operation when retyping to one of the types that allow
6491  * for executable permissions.
6492  */
6493 void
6494 pmap_sync_page_data_phys(
6495 	__unused ppnum_t pp)
6496 {
6497 	return;
6498 }
6499 
6500 /*
6501  * pmap_sync_page_attributes_phys(ppnum_t pp)
6502  *
6503  * Write back and invalidate all cachelines on a physical page.
6504  */
6505 void
6506 pmap_sync_page_attributes_phys(
6507 	ppnum_t pp)
6508 {
6509 	flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
6510 }
6511 
6512 #if CONFIG_COREDUMP
6513 /* temporary workaround */
6514 boolean_t
6515 coredumpok(
6516 	vm_map_t map,
6517 	mach_vm_offset_t va)
6518 {
6519 	pt_entry_t     *pte_p;
6520 	pt_entry_t      spte;
6521 
6522 	pte_p = pmap_pte(map->pmap, va);
6523 	if (0 == pte_p) {
6524 		return FALSE;
6525 	}
6526 	if (vm_map_entry_has_device_pager(map, va)) {
6527 		return FALSE;
6528 	}
6529 	spte = *pte_p;
6530 	return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
6531 }
6532 #endif
6533 
6534 void
6535 fillPage(
6536 	ppnum_t pn,
6537 	unsigned int fill)
6538 {
6539 	unsigned int   *addr;
6540 	int             count;
6541 
6542 	addr = (unsigned int *) phystokv(ptoa(pn));
6543 	count = PAGE_SIZE / sizeof(unsigned int);
6544 	while (count--) {
6545 		*addr++ = fill;
6546 	}
6547 }
6548 
6549 extern void     mapping_set_mod(ppnum_t pn);
6550 
6551 void
6552 mapping_set_mod(
6553 	ppnum_t pn)
6554 {
6555 	pmap_set_modify(pn);
6556 }
6557 
6558 extern void     mapping_set_ref(ppnum_t pn);
6559 
6560 void
6561 mapping_set_ref(
6562 	ppnum_t pn)
6563 {
6564 	pmap_set_reference(pn);
6565 }
6566 
6567 /*
6568  * Clear specified attribute bits.
6569  *
6570  * Try to force an arm_fast_fault() for all mappings of
6571  * the page - to force attributes to be set again at fault time.
6572  * If the forcing succeeds, clear the cached bits at the head.
6573  * Otherwise, something must have been wired, so leave the cached
6574  * attributes alone.
6575  */
6576 MARK_AS_PMAP_TEXT static void
6577 phys_attribute_clear_with_flush_range(
6578 	ppnum_t         pn,
6579 	unsigned int    bits,
6580 	int             options,
6581 	void            *arg,
6582 	pmap_tlb_flush_range_t *flush_range)
6583 {
6584 	pmap_paddr_t    pa = ptoa(pn);
6585 	vm_prot_t       allow_mode = VM_PROT_ALL;
6586 
6587 	if ((arg != NULL) || (flush_range != NULL)) {
6588 		options = options & ~PMAP_OPTIONS_NOFLUSH;
6589 	}
6590 
6591 	if (__improbable((options & PMAP_OPTIONS_FF_WIRED) != 0)) {
6592 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6593 		    "invalid options",
6594 		    pn, bits, options, arg, flush_range);
6595 	}
6596 
6597 	if (__improbable((bits & PP_ATTR_MODIFIED) &&
6598 	    (options & PMAP_OPTIONS_NOFLUSH))) {
6599 		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
6600 		    "should not clear 'modified' without flushing TLBs",
6601 		    pn, bits, options, arg, flush_range);
6602 	}
6603 
6604 	assert(pn != vm_page_fictitious_addr);
6605 
6606 	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
6607 		assert(bits == PP_ATTR_MODIFIED);
6608 
6609 		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, NULL, flush_range);
6610 		/*
6611 		 * We short circuit this case; it should not need to
6612 		 * invoke arm_force_fast_fault, so just clear the modified bit.
6613 		 * pmap_page_protect has taken care of resetting
6614 		 * the state so that we'll see the next write as a fault to
6615 		 * the VM (i.e. we don't want a fast fault).
6616 		 */
6617 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6618 		return;
6619 	}
6620 	if (bits & PP_ATTR_REFERENCED) {
6621 		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
6622 	}
6623 	if (bits & PP_ATTR_MODIFIED) {
6624 		allow_mode &= ~VM_PROT_WRITE;
6625 	}
6626 
6627 	if (bits == PP_ATTR_NOENCRYPT) {
6628 		/*
6629 		 * We short circuit this case; it should not need to
6630 		 * invoke arm_force_fast_fault, so just clear and
6631 		 * return.  On ARM, this bit is just a debugging aid.
6632 		 */
6633 		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
6634 		return;
6635 	}
6636 
6637 	arm_force_fast_fault_with_flush_range(pn, allow_mode, options, NULL, (pp_attr_t)bits, flush_range);
6638 }
6639 
6640 MARK_AS_PMAP_TEXT void
6641 phys_attribute_clear_internal(
6642 	ppnum_t         pn,
6643 	unsigned int    bits,
6644 	int             options,
6645 	void            *arg)
6646 {
6647 	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
6648 }
6649 
6650 #if __ARM_RANGE_TLBI__
6651 
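/**
 * Clear the specified attribute bits for all managed pages mapped by a single leaf
 * page table (i.e. a VA range that does not cross a twig entry boundary), accumulating
 * the resulting PTE updates into [flush_range] so they can be submitted to the SPTM
 * in batches.
 *
 * @note The caller must hold the pmap lock in shared mode, and [end] - [start] must
 *       not exceed the twig size for the pmap.
 *
 * @return The address at which processing stopped; currently always [end].
 */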
6652 MARK_AS_PMAP_TEXT static vm_map_address_t
6653 phys_attribute_clear_twig_internal(
6654 	pmap_t pmap,
6655 	vm_map_address_t start,
6656 	vm_map_address_t end,
6657 	unsigned int bits,
6658 	unsigned int options,
6659 	pmap_tlb_flush_range_t *flush_range)
6660 {
6661 	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
6662 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6663 	assert(end >= start);
6664 	assert((end - start) <= pt_attr_twig_size(pt_attr));
6665 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
6666 	vm_map_address_t va = start;
6667 	pt_entry_t     *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
6668 	tt_entry_t     *tte_p;
6669 	tte_p = pmap_tte(pmap, start);
6670 
6671 	/**
6672 	 * It's possible that this portion of our VA region has never been paged in, in which case
6673 	 * there may not be a valid twig or leaf table here.
6674 	 */
6675 	if ((tte_p == (tt_entry_t *) NULL) || !tte_is_valid_table(*tte_p)) {
6676 		assert(flush_range->pending_region_entries == 0);
6677 		return end;
6678 	}
6679 
6680 	pte_p = (pt_entry_t *) ttetokv(*tte_p);
6681 
6682 	start_pte_p = &pte_p[pte_index(pt_attr, start)];
6683 	end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
6684 	assert(end_pte_p >= start_pte_p);
6685 	for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
6686 		if (flush_range->pending_region_entries == 0) {
6687 			flush_range->pending_region_start = va;
6688 		} else {
6689 			assertf((flush_range->pending_region_start +
6690 			    (flush_range->pending_region_entries * pmap_page_size)) == va,
6691 			    "pending_region_start 0x%llx + 0x%lx pages != va 0x%llx",
6692 			    (unsigned long long)flush_range->pending_region_start,
6693 			    (unsigned long)flush_range->pending_region_entries,
6694 			    (unsigned long long)va);
6695 		}
6696 		flush_range->current_ptep = curr_pte_p;
6697 		const pt_entry_t spte = os_atomic_load(curr_pte_p, relaxed);
6698 		const pmap_paddr_t pa = pte_to_pa(spte);
6699 		if (pte_is_valid(spte) && pa_valid(pa)) {
6700 			/* The PTE maps a managed page, so do the appropriate PV list-based permission changes. */
6701 			const ppnum_t pn = (ppnum_t) atop(pa);
6702 			phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
6703 			if (__probable(flush_range->region_entry_added)) {
6704 				flush_range->region_entry_added = false;
6705 			} else {
6706 				/**
6707 				 * It's possible that some other thread removed the mapping between our check
6708 				 * of the PTE above and taking the PVH lock in the
6709 				 * phys_attribute_clear_with_flush_range() path.  In that case we have a
6710 				 * discontinuity in the region to update, so just submit any pending region
6711 				 * templates and start a new region op on the next iteration.
6712 				 */
6713 				pmap_multipage_op_submit_region(flush_range);
6714 			}
6715 		} else if (__improbable(!pte_is_valid(spte))) {
6716 			/**
6717 			 * We've found an invalid mapping, so we have a discontinuity in the region to
6718 			 * update.  Handle this by submitting any pending region templates and starting a new
6719 			 * region on the next iteration.  In theory we could instead handle this by installing
6720 			 * a "safe" (AF bit cleared, minimal permissions) PTE template; the SPTM would just
6721 			 * ignore the update on finding an invalid mapping in the PTE.  But we don't know
6722 			 * what a "safe" template will be in all cases: for example, JIT regions require all
6723 			 * mappings to either be invalid or to have full RWX permissions.
6724 			 */
6725 			pmap_multipage_op_submit_region(flush_range);
6726 		} else if (pmap_insert_flush_range_template(spte, flush_range)) {
6727 			/**
6728 			 * We've found a mapping to a non-managed page, so just insert the existing
6729 			 * PTE into the pending region ops since we don't manage attributes for non-managed
6730 			 * pages.
6731 			 * If pmap_insert_flush_range_template() returns true, indicating that it reached
6732 			 * the mapping limit and submitted the SPTM call, then we also submit any pending
6733 			 * disjoint ops.  Having pending operations in either category will keep preemption
6734 			 * disabled, and we want to ensure that we can at least temporarily
6735 			 * re-enable preemption every SPTM_MAPPING_LIMIT mappings.
6736 			 */
6737 			pmap_multipage_op_submit_disjoint(0, flush_range);
6738 		}
6739 
6740 		/**
6741 		 * If the total number of pending + processed entries exceeds the mapping threshold,
6742 		 * we may need to submit all pending operations to avoid excessive preemption latency.
6743 		 * Otherwise, a small number of pending disjoint or region ops can hold preemption
6744 		 * disabled across an arbitrary number of total processed entries.
6745 		 * As an optimization, we may be able to avoid submitting if no urgent AST is
6746 		 * pending on the local CPU, but only if we aren't currently in an epoch.  If we are
6747 		 * in an epoch, failure to submit in a timely manner can cause another CPU to wait
6748 		 * too long for our epoch to drain.
6749 		 */
6750 		if (((flush_range->processed_entries + flush_range->pending_disjoint_entries +
6751 		    flush_range->pending_region_entries) >= SPTM_MAPPING_LIMIT) &&
6752 		    (pmap_in_epoch() || pmap_pending_preemption())) {
6753 			pmap_multipage_op_submit(flush_range);
6754 			assert(preemption_enabled());
6755 		}
6756 	}
6757 
6758 	/* SPTM region ops can't span L3 table boundaries, so submit any pending region templates now. */
6759 	pmap_multipage_op_submit_region(flush_range);
6760 	return end;
6761 }
6762 
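/**
 * Clear the specified attribute bits for all managed pages mapped in the VA range
 * [start, end) of a pmap.  The range is processed one twig region at a time so that
 * SPTM operations can be batched, and a single ranged TLB flush is issued at the end
 * if any of the mapping updates require it.
 *
 * @return The address at which processing stopped; [end] if the entire range was
 *         processed.
 */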
6763 MARK_AS_PMAP_TEXT vm_map_address_t
6764 phys_attribute_clear_range_internal(
6765 	pmap_t pmap,
6766 	vm_map_address_t start,
6767 	vm_map_address_t end,
6768 	unsigned int bits,
6769 	unsigned int options)
6770 {
6771 	if (__improbable(end < start)) {
6772 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
6773 	}
6774 	validate_pmap_mutable(pmap);
6775 
6776 	vm_map_address_t va = start;
6777 	pmap_tlb_flush_range_t flush_range = {
6778 		.ptfr_pmap = pmap,
6779 		.ptfr_start = start,
6780 		.ptfr_end = end,
6781 		.current_ptep = NULL,
6782 		.pending_region_start = 0,
6783 		.pending_region_entries = 0,
6784 		.region_entry_added = false,
6785 		.current_header = NULL,
6786 		.current_header_first_mapping_index = 0,
6787 		.processed_entries = 0,
6788 		.pending_disjoint_entries = 0,
6789 		.ptfr_flush_needed = false
6790 	};
6791 
6792 	pmap_lock(pmap, PMAP_LOCK_SHARED);
6793 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6794 
6795 	while (va < end) {
6796 		vm_map_address_t curr_end;
6797 
6798 		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
6799 		if (curr_end > end) {
6800 			curr_end = end;
6801 		}
6802 
6803 		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
6804 	}
6805 	pmap_multipage_op_submit(&flush_range);
6806 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
6807 	assert((flush_range.pending_disjoint_entries == 0) && (flush_range.pending_region_entries == 0));
6808 	if (flush_range.ptfr_flush_needed) {
6809 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
6810 			flush_range.ptfr_start,
6811 			flush_range.ptfr_end - flush_range.ptfr_start,
6812 			flush_range.ptfr_pmap,
6813 			true);
6814 		sync_tlb_flush();
6815 	}
6816 	return va;
6817 }
6818 
6819 static void
6820 phys_attribute_clear_range(
6821 	pmap_t pmap,
6822 	vm_map_address_t start,
6823 	vm_map_address_t end,
6824 	unsigned int bits,
6825 	unsigned int options)
6826 {
6827 	/*
6828 	 * We allow single-page requests to execute non-preemptibly,
6829 	 * as it doesn't make sense to sample AST_URGENT for a single-page
6830 	 * operation, and there are a couple of special use cases that
6831 	 * require a non-preemptible single-page operation.
6832 	 */
6833 	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
6834 		pmap_verify_preemptible();
6835 	}
6836 	__assert_only const int preemption_level = get_preemption_level();
6837 
6838 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
6839 
6840 	phys_attribute_clear_range_internal(pmap, start, end, bits, options);
6841 
6842 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
6843 
6844 	assert(preemption_level == get_preemption_level());
6845 }
6846 #endif /* __ARM_RANGE_TLBI__ */
6847 
6848 static void
6849 phys_attribute_clear(
6850 	ppnum_t         pn,
6851 	unsigned int    bits,
6852 	int             options,
6853 	void            *arg)
6854 {
6855 	/*
6856 	 * Do we really want this tracepoint?  It will be extremely chatty.
6857 	 * Also, should we have a corresponding trace point for the set path?
6858 	 */
6859 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
6860 
6861 	phys_attribute_clear_internal(pn, bits, options, arg);
6862 
6863 	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
6864 }
6865 
6866 /*
6867  *	Set specified attribute bits.
6868  *
6869  *	Set cached value in the pv head because we have
6870  *	no per-mapping hardware support for referenced and
6871  *	modify bits.
6872  */
6873 MARK_AS_PMAP_TEXT void
6874 phys_attribute_set_internal(
6875 	ppnum_t pn,
6876 	unsigned int bits)
6877 {
6878 	pmap_paddr_t    pa = ptoa(pn);
6879 	assert(pn != vm_page_fictitious_addr);
6880 
6881 	ppattr_pa_set_bits(pa, (uint16_t)bits);
6882 
6883 	return;
6884 }
6885 
6886 static void
6887 phys_attribute_set(
6888 	ppnum_t pn,
6889 	unsigned int bits)
6890 {
6891 	phys_attribute_set_internal(pn, bits);
6892 }
6893 
6894 
6895 /*
6896  *	Check specified attribute bits.
6897  *
6898  *	use the software cached bits (since no hw support).
6899  */
6900 static boolean_t
6901 phys_attribute_test(
6902 	ppnum_t pn,
6903 	unsigned int bits)
6904 {
6905 	pmap_paddr_t    pa = ptoa(pn);
6906 	assert(pn != vm_page_fictitious_addr);
6907 	return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
6908 }
6909 
6910 
6911 /*
6912  *	Set the modify/reference bits on the specified physical page.
6913  */
6914 void
6915 pmap_set_modify(ppnum_t pn)
6916 {
6917 	phys_attribute_set(pn, PP_ATTR_MODIFIED);
6918 }
6919 
6920 
6921 /*
6922  *	Clear the modify bits on the specified physical page.
6923  */
6924 void
6925 pmap_clear_modify(
6926 	ppnum_t pn)
6927 {
6928 	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
6929 }
6930 
6931 
6932 /*
6933  *	pmap_is_modified:
6934  *
6935  *	Return whether or not the specified physical page is modified
6936  *	by any physical maps.
6937  */
6938 boolean_t
6939 pmap_is_modified(
6940 	ppnum_t pn)
6941 {
6942 	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
6943 }
6944 
6945 
6946 /*
6947  *	Set the reference bit on the specified physical page.
6948  */
6949 static void
6950 pmap_set_reference(
6951 	ppnum_t pn)
6952 {
6953 	phys_attribute_set(pn, PP_ATTR_REFERENCED);
6954 }
6955 
6956 /*
6957  *	Clear the reference bits on the specified physical page.
6958  */
6959 void
6960 pmap_clear_reference(
6961 	ppnum_t pn)
6962 {
6963 	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
6964 }
6965 
6966 
6967 /*
6968  *	pmap_is_referenced:
6969  *
6970  *	Return whether or not the specified physical page is referenced
6971  *	by any physical maps.
6972  */
6973 boolean_t
6974 pmap_is_referenced(
6975 	ppnum_t pn)
6976 {
6977 	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
6978 }
6979 
6980 /*
6981  * pmap_get_refmod(phys)
6982  *  returns the referenced and modified bits of the specified
6983  *  physical page.
6984  */
6985 unsigned int
6986 pmap_get_refmod(
6987 	ppnum_t pn)
6988 {
6989 	return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
6990 	       | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
6991 }
6992 
6993 static inline unsigned int
6994 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
6995 {
6996 	return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
6997 	       ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
6998 }
6999 
7000 /*
7001  * pmap_clear_refmod(phys, mask)
7002  *  clears the referenced and modified bits as specified by the mask
7003  *  of the specified physical page.
7004  */
7005 void
7006 pmap_clear_refmod_options(
7007 	ppnum_t         pn,
7008 	unsigned int    mask,
7009 	unsigned int    options,
7010 	void            *arg)
7011 {
7012 	unsigned int    bits;
7013 
7014 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7015 	phys_attribute_clear(pn, bits, options, arg);
7016 }
7017 
7018 /*
7019  * Perform pmap_clear_refmod_options on a virtual address range.
7020  * The operation will be performed in bulk & tlb flushes will be coalesced
7021  * if possible.
7022  *
7023  * Returns true if the operation is supported on this platform.
7024  * If this function returns false, the operation is not supported and
7025  * nothing has been modified in the pmap.
7026  */
7027 bool
7028 pmap_clear_refmod_range_options(
7029 	pmap_t pmap __unused,
7030 	vm_map_address_t start __unused,
7031 	vm_map_address_t end __unused,
7032 	unsigned int mask __unused,
7033 	unsigned int options __unused)
7034 {
7035 #if __ARM_RANGE_TLBI__
7036 	unsigned int    bits;
7037 	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7038 	phys_attribute_clear_range(pmap, start, end, bits, options);
7039 	return true;
7040 #else /* __ARM_RANGE_TLBI__ */
7041 #pragma unused(pmap, start, end, mask, options)
7042 	/*
7043 	 * This operation allows the VM to bulk modify refmod bits on a virtually
7044 	 * contiguous range of addresses. This is a large performance improvement on
7045 	 * platforms that support ranged tlbi instructions. But on older platforms,
7046 	 * we can only flush per-page or the entire asid, so we currently
7047 	 * only support this operation on platforms that support ranged tlbi
7048 	 * instructions. On other platforms, we require that
7049 	 * the VM modify the bits on a per-page basis.
7050 	 */
7051 	return false;
7052 #endif /* __ARM_RANGE_TLBI__ */
7053 }
7054 
7055 void
7056 pmap_clear_refmod(
7057 	ppnum_t pn,
7058 	unsigned int mask)
7059 {
7060 	pmap_clear_refmod_options(pn, mask, 0, NULL);
7061 }
7062 
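/*
 *	Routine:	pmap_disconnect_options
 *	Function:	Disconnect all mappings for this page, honoring the given pmap options.
 *			If PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED is passed, the software-managed
 *			"modified" bit is consulted to decide whether compressor accounting
 *			should apply.  Returns the page's reference/change status in generic format.
 */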
7063 unsigned int
7064 pmap_disconnect_options(
7065 	ppnum_t pn,
7066 	unsigned int options,
7067 	void *arg)
7068 {
7069 	if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7070 		/*
7071 		 * On ARM, the "modified" bit is managed by software, so
7072 		 * we know up-front if the physical page is "modified",
7073 		 * without having to scan all the PTEs pointing to it.
7074 		 * The caller should have made the VM page "busy" so no one
7075 		 * should be able to establish any new mapping and "modify"
7076 		 * the page behind us.
7077 		 */
7078 		if (pmap_is_modified(pn)) {
7079 			/*
7080 			 * The page has been modified and will be sent to
7081 			 * the VM compressor.
7082 			 */
7083 			options |= PMAP_OPTIONS_COMPRESSOR;
7084 		} else {
7085 			/*
7086 			 * The page hasn't been modified and will be freed
7087 			 * instead of compressed.
7088 			 */
7089 		}
7090 	}
7091 
7092 	/* disconnect the page */
7093 	pmap_page_protect_options(pn, 0, options, arg);
7094 
7095 	/* return ref/chg status */
7096 	return pmap_get_refmod(pn);
7097 }
7098 
7099 /*
7100  *	Routine:
7101  *		pmap_disconnect
7102  *
7103  *	Function:
7104  *		Disconnect all mappings for this page and return reference and change status
7105  *		in generic format.
7106  *
7107  */
7108 unsigned int
7109 pmap_disconnect(
7110 	ppnum_t pn)
7111 {
7112 	pmap_page_protect(pn, 0);       /* disconnect the page */
7113 	return pmap_get_refmod(pn);   /* return ref/chg status */
7114 }
7115 
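/*
 *	Routine:	pmap_has_managed_page
 *	Function:	Return whether any page in the inclusive range [first, last] falls
 *			within the pmap-managed physical address range.
 */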
7116 boolean_t
7117 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7118 {
7119 	if (ptoa(first) >= vm_last_phys) {
7120 		return FALSE;
7121 	}
7122 	if (ptoa(last) < vm_first_phys) {
7123 		return FALSE;
7124 	}
7125 
7126 	return TRUE;
7127 }
7128 
7129 /*
7130  * The state maintained by the noencrypt functions is used as a
7131  * debugging aid on ARM.  This incurs some overhead on the part
7132  * of the caller.  A special case check in phys_attribute_clear
7133  * (the most expensive path) currently minimizes this overhead,
7134  * but stubbing these functions out on RELEASE kernels yields
7135  * further wins.
7136  */
7137 boolean_t
7138 pmap_is_noencrypt(
7139 	ppnum_t pn)
7140 {
7141 #if DEVELOPMENT || DEBUG
7142 	boolean_t result = FALSE;
7143 
7144 	if (!pa_valid(ptoa(pn))) {
7145 		return FALSE;
7146 	}
7147 
7148 	result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7149 
7150 	return result;
7151 #else
7152 #pragma unused(pn)
7153 	return FALSE;
7154 #endif
7155 }
7156 
7157 void
7158 pmap_set_noencrypt(
7159 	ppnum_t pn)
7160 {
7161 #if DEVELOPMENT || DEBUG
7162 	if (!pa_valid(ptoa(pn))) {
7163 		return;
7164 	}
7165 
7166 	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7167 #else
7168 #pragma unused(pn)
7169 #endif
7170 }
7171 
7172 void
7173 pmap_clear_noencrypt(
7174 	ppnum_t pn)
7175 {
7176 #if DEVELOPMENT || DEBUG
7177 	if (!pa_valid(ptoa(pn))) {
7178 		return;
7179 	}
7180 
7181 	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7182 #else
7183 #pragma unused(pn)
7184 #endif
7185 }
7186 
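/*
 *	Routine:	pmap_lock_phys_page
 *	Function:	Acquire the PV head lock for a managed physical page, or the global
 *			phys_backup_lock for a non-managed page.  Paired with
 *			pmap_unlock_phys_page().
 */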
7187 void
7188 pmap_lock_phys_page(ppnum_t pn)
7189 {
7190 	unsigned int    pai;
7191 	pmap_paddr_t    phys = ptoa(pn);
7192 
7193 	if (pa_valid(phys)) {
7194 		pai = pa_index(phys);
7195 		__unused const locked_pvh_t locked_pvh = pvh_lock(pai);
7196 	} else {
7197 		simple_lock(&phys_backup_lock, LCK_GRP_NULL);
7198 	}
7199 }
7200 
7201 
7202 void
7203 pmap_unlock_phys_page(ppnum_t pn)
7204 {
7205 	unsigned int    pai;
7206 	pmap_paddr_t    phys = ptoa(pn);
7207 
7208 	if (pa_valid(phys)) {
7209 		pai = pa_index(phys);
7210 		locked_pvh_t locked_pvh = {.pvh = pai_to_pvh(pai), .pai = pai};
7211 		pvh_unlock(&locked_pvh);
7212 	} else {
7213 		simple_unlock(&phys_backup_lock);
7214 	}
7215 }
7216 
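/*
 *	Routine:	pmap_clear_user_ttb_internal
 *	Function:	Point the user translation table base register at the invalid
 *			(empty) translation table on the current CPU.
 */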
7217 MARK_AS_PMAP_TEXT void
7218 pmap_clear_user_ttb_internal(void)
7219 {
7220 	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7221 }
7222 
7223 void
7224 pmap_clear_user_ttb(void)
7225 {
7226 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7227 	pmap_clear_user_ttb_internal();
7228 	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7229 }
7230 
7231 /**
7232  * Set up a "fast fault", or a page fault that won't go through the VM layer on
7233  * a page. This is primarily used to manage ref/mod bits in software. Depending
7234  * on the value of allow_mode, the next read and/or write of the page will fault
7235  * and the ref/mod bits will be updated.
7236  *
7237  * @param ppnum Page number to set up a fast fault on.
7238  * @param allow_mode VM_PROT_NONE will cause the next read and write access to
7239  *                   fault.
7240  *                   VM_PROT_READ will only cause the next write access to fault.
7241  *                   Other values are undefined.
7242  * @param options PMAP_OPTIONS_NOFLUSH indicates TLBI flush is not needed.
7243  *                PMAP_OPTIONS_FF_WIRED forces a fast fault even on wired pages.
7244  *                PMAP_OPTIONS_SET_REUSABLE/PMAP_OPTIONS_CLEAR_REUSABLE updates
7245  *                the global reusable bit of the page.
7246  * @param locked_pvh If non-NULL, this indicates the PVH lock for [ppnum] is already locked
7247  *                   by the caller.  This is an input/output parameter which may be updated
7248  *                   to reflect a new PV head value to be passed to a later call to pvh_unlock().
7249  * @param bits_to_clear Mask of additional pp_attr_t bits to clear for the physical
7250  *                      page, iff this function completes successfully and returns
7251  *                      TRUE.  This is typically some combination of
7252  *                      the referenced, modified, and noencrypt bits.
7253  * @param flush_range When present, this function will skip the TLB flush for the
7254  *                    mappings that are covered by the range, leaving that to be
7255  *                    done later by the caller.  It may also avoid submitting mapping
7256  *                    updates directly to the SPTM, instead accumulating them in a
7257  *                    per-CPU array to be submitted later by the caller.
7258  *
7259  * @return TRUE if the fast fault was successfully configured for all mappings
7260  *         of the page, FALSE otherwise (e.g. if wired mappings are present and
7261  *         PMAP_OPTIONS_FF_WIRED was not passed).
7262  *
7263  * @note PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7264  *
7265  * @warning PMAP_OPTIONS_FF_WIRED should only be used with pages accessible from
7266  *          EL0.  The kernel may assume that accesses to wired, kernel-owned pages
7267  *          won't fault.
7268  */
7269 MARK_AS_PMAP_TEXT static boolean_t
7270 arm_force_fast_fault_with_flush_range(
7271 	ppnum_t         ppnum,
7272 	vm_prot_t       allow_mode,
7273 	int             options,
7274 	locked_pvh_t   *locked_pvh,
7275 	pp_attr_t       bits_to_clear,
7276 	pmap_tlb_flush_range_t *flush_range)
7277 {
7278 	pmap_paddr_t     phys = ptoa(ppnum);
7279 	pv_entry_t      *pve_p;
7280 	pt_entry_t      *pte_p;
7281 	unsigned int     pai;
7282 	boolean_t        result;
7283 	unsigned int     num_mappings = 0, num_skipped_mappings = 0;
7284 	bool             ref_fault;
7285 	bool             mod_fault;
7286 	bool             clear_write_fault = false;
7287 	bool             ref_aliases_mod = false;
7288 
7289 	assert(ppnum != vm_page_fictitious_addr);
7290 
7291 	/**
7292 	 * Assert that PMAP_OPTIONS_NOFLUSH and flush_range cannot both be specified.
7293 	 *
7294 	 * PMAP_OPTIONS_NOFLUSH indicates there is no need to flush the TLB during the entire operation, and
7295 	 * flush_range indicates the caller requests deferral of the TLB flushing. Fundamentally, the two
7296 	 * semantics conflict with each other, so assert they are not both true.
7297 	 */
7298 	assert(!(flush_range && (options & PMAP_OPTIONS_NOFLUSH)));
7299 
7300 	if (!pa_valid(phys)) {
7301 		return FALSE;   /* Not a managed page. */
7302 	}
7303 
7304 	result = TRUE;
7305 	ref_fault = false;
7306 	mod_fault = false;
7307 	pai = pa_index(phys);
7308 	locked_pvh_t local_locked_pvh = {.pvh = 0};
7309 	if (__probable(locked_pvh == NULL)) {
7310 		if (flush_range != NULL) {
7311 			/**
7312 			 * If we're partway through processing a multi-page batched call,
7313 			 * preemption will already be disabled so we can't simply call
7314 			 * pvh_lock() which may block.  Instead, we first try to acquire
7315 			 * the lock without waiting, which in most cases should succeed.
7316 			 * If it fails, we submit the pending batched operations to re-
7317 			 * enable preemption and then acquire the lock normally.
7318 			 */
7319 			local_locked_pvh = pvh_try_lock(pai);
7320 			if (__improbable(!pvh_try_lock_success(&local_locked_pvh))) {
7321 				pmap_multipage_op_submit(flush_range);
7322 				local_locked_pvh = pvh_lock(pai);
7323 			}
7324 		} else {
7325 			local_locked_pvh = pvh_lock(pai);
7326 		}
7327 	} else {
7328 		local_locked_pvh = *locked_pvh;
7329 		assert(pai == local_locked_pvh.pai);
7330 	}
7331 	assert(local_locked_pvh.pvh != 0);
7332 	pvh_assert_locked(pai);
7333 
7334 	pte_p = PT_ENTRY_NULL;
7335 	pve_p = PV_ENTRY_NULL;
7336 	if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PTEP)) {
7337 		pte_p = pvh_ptep(local_locked_pvh.pvh);
7338 	} else if (pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_PVEP)) {
7339 		pve_p = pvh_pve_list(local_locked_pvh.pvh);
7340 	} else if (__improbable(!pvh_test_type(local_locked_pvh.pvh, PVH_TYPE_NULL))) {
7341 		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)local_locked_pvh.pvh, (uint64_t)phys);
7342 	}
7343 
7344 	const bool is_reusable = ppattr_test_reusable(pai);
7345 
7346 	bool pvh_lock_sleep_mode_needed = false;
7347 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
7348 	sptm_disjoint_op_t *sptm_ops = NULL;
7349 
7350 	/**
7351 	 * This would also work as a block, with the above variables declared using the
7352 	 * __block qualifier, but the extra runtime overhead of block syntax (e.g.
7353 	 * dereferencing __block variables through stack forwarding pointers) isn't needed
7354 	 * here, as we never need to use this code sequence as a closure.
7355 	 */
7356 	#define FFF_PERCPU_INIT() do { \
7357 	        disable_preemption(); \
7358 	        sptm_pcpu = PERCPU_GET(pmap_sptm_percpu); \
7359 	        sptm_ops = sptm_pcpu->sptm_ops; \
7360 	} while (0)
7361 
7362 	FFF_PERCPU_INIT();
7363 
7364 	int pve_ptep_idx = 0;
7365 
7366 	/**
7367 	 * With regard to TLBI, there are three cases:
7368 	 *
7369 	 * 1. PMAP_OPTIONS_NOFLUSH is specified. In such case, SPTM doesn't need to flush TLB and neither does pmap.
7370 	 * 2. PMAP_OPTIONS_NOFLUSH is not specified, but flush_range is, indicating the caller intends to flush TLB
7371 	 *    itself (with range TLBI). In such case, we check the flush_range limits and only issue the TLBI if a
7372 	 *    mapping is out of the range.
7373 	 * 3. Neither PMAP_OPTIONS_NOFLUSH nor a valid flush_range pointer is specified. In such case, we should just
7374 	 *    let SPTM handle TLBI flushing.
7375 	 */
7376 	const bool defer_tlbi = (options & PMAP_OPTIONS_NOFLUSH) || flush_range;
7377 	const uint32_t sptm_update_options = SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF | (defer_tlbi ? SPTM_UPDATE_DEFER_TLBI : 0);
7378 
7379 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7380 		pt_entry_t       spte;
7381 		pt_entry_t       tmplate;
7382 
7383 		if (__improbable(pvh_lock_sleep_mode_needed)) {
7384 			assert((num_mappings == 0) && (num_skipped_mappings == 0));
7385 			/**
7386 			 * Undo the explicit preemption disable done in the last call to FFF_PERCPU_INIT().
7387 			 * If the PVH lock is placed in sleep mode, we can't rely on it to disable preemption,
7388 			 * so we need these explicit preemption twiddles to ensure we don't get migrated off-
7389 			 * core while processing SPTM per-CPU data.  At the same time, we also want preemption
7390 			 * to briefly be re-enabled every SPTM_MAPPING_LIMIT mappings so that any pending
7391 			 * urgent ASTs can be handled.
7392 			 */
7393 			enable_preemption();
7394 			pvh_lock_enter_sleep_mode(&local_locked_pvh);
7395 			pvh_lock_sleep_mode_needed = false;
7396 			FFF_PERCPU_INIT();
7397 		}
7398 
7399 		if (pve_p != PV_ENTRY_NULL) {
7400 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7401 			if (pte_p == PT_ENTRY_NULL) {
7402 				goto fff_skip_pve;
7403 			}
7404 		}
7405 
7406 #ifdef PVH_FLAG_IOMMU
7407 		if (pvh_ptep_is_iommu(pte_p)) {
7408 			++num_skipped_mappings;
7409 			goto fff_skip_pve;
7410 		}
7411 #endif
7412 		spte = os_atomic_load(pte_p, relaxed);
7413 		if (pte_is_compressed(spte, pte_p)) {
7414 			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
7415 		}
7416 
7417 		pt_desc_t *ptdp = NULL;
7418 		pmap_t pmap = NULL;
7419 		vm_map_address_t va = 0;
7420 
7421 		if ((flush_range != NULL) && (pte_p == flush_range->current_ptep)) {
7422 			/**
7423 			 * If the current mapping matches the flush range's current iteration position,
7424 			 * there's no need to do the work of getting the PTD.  We already know the pmap,
7425 			 * and the VA is implied by flush_range->pending_region_start.
7426 			 */
7427 			pmap = flush_range->ptfr_pmap;
7428 		} else {
7429 			ptdp = ptep_get_ptd(pte_p);
7430 			pmap = ptdp->pmap;
7431 			va = ptd_get_va(ptdp, pte_p);
7432 			assert(va >= pmap->min && va < pmap->max);
7433 		}
7434 
7435 		bool skip_pte = pte_is_wired(spte) &&
7436 		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);
7437 
7438 		if (skip_pte) {
7439 			result = FALSE;
7440 		}
7441 
7442 		// A concurrent pmap_remove() may have cleared the PTE
7443 		if (__improbable(!pte_is_valid(spte))) {
7444 			skip_pte = true;
7445 		}
7446 
7447 		/**
7448 		 * If the PTD is NULL, we're adding the current mapping to the pending region templates instead of the
7449 		 * pending disjoint ops, so we don't need to do flush range disjoint op management.
7450 		 */
7451 		if ((flush_range != NULL) && (ptdp != NULL) && !skip_pte) {
7452 			/**
7453 			 * Insert a "header" entry for this physical page into the SPTM disjoint ops array.
7454 			 * We do this in three cases:
7455 			 * 1) We're at the beginning of the SPTM ops array (num_mappings == 0, flush_range->pending_disjoint_entries == 0).
7456 			 * 2) We may not be at the beginning of the SPTM ops array, but we are about to add the first operation
7457 			 *    for this physical page (num_mappings == 0, flush_range->pending_disjoint_entries == ?).
7458 			 * 3) We need to change the options passed to the SPTM for a run of one or more mappings.  Specifically,
7459 			 *    if we encounter a run of mappings that reside outside the VA region of our flush_range, or that
7460 			 *    belong to a pmap other than the one targeted by our flush_range, we should ask the SPTM to flush
7461 			 *    the TLB for us (i.e., clear SPTM_UPDATE_DEFER_TLBI), but only for those specific mappings.
7462 			 */
7463 			uint32_t per_mapping_sptm_update_options = sptm_update_options;
7464 			if ((flush_range->ptfr_pmap != pmap) || (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
7465 				per_mapping_sptm_update_options &= ~SPTM_UPDATE_DEFER_TLBI;
7466 			}
7467 			if ((num_mappings == 0) ||
7468 			    (flush_range->current_header->per_paddr_header.options != per_mapping_sptm_update_options)) {
7469 				if (pmap_multipage_op_add_page(phys, &num_mappings, per_mapping_sptm_update_options, flush_range)) {
7470 					/**
7471 					 * If we needed to submit the pending disjoint ops to make room for the new page,
7472 					 * flush any pending region ops to reenable preemption and restart the loop with
7473 					 * the lock in sleep mode.  This prevents preemption from being held disabled
7474 					 * for an arbitrary amount of time in the pathological case in which we have
7475 					 * both pending region ops and an excessively long PV list that repeatedly
7476 					 * requires new page headers with SPTM_MAPPING_LIMIT - 1 entries already pending.
7477 					 */
7478 					pmap_multipage_op_submit_region(flush_range);
7479 					assert(num_mappings == 0);
7480 					num_skipped_mappings = 0;
7481 					pvh_lock_sleep_mode_needed = true;
7482 					continue;
7483 				}
7484 			}
7485 		}
7486 
7487 		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7488 
7489 		/* update pmap stats and ledgers */
7490 		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
7491 		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
7492 		if (is_altacct) {
7493 			/*
7494 			 * We do not track "reusable" status for
7495 			 * "alternate accounting" mappings.
7496 			 */
7497 		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
7498 		    is_reusable &&
7499 		    is_internal &&
7500 		    pmap != kernel_pmap) {
7501 			/* one less "reusable" */
7502 			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7503 			/* one more "internal" */
7504 			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7505 			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7506 
7507 			/*
7508 			 * Since the page is being marked non-reusable, we assume that it will be
7509 			 * modified soon.  Avoid the cost of another trap to handle the fast
7510 			 * fault when we next write to this page.
7511 			 */
7512 			clear_write_fault = true;
7513 		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
7514 		    !is_reusable &&
7515 		    is_internal &&
7516 		    pmap != kernel_pmap) {
7517 			/* one more "reusable" */
7518 			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7519 			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7520 			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
7521 		}
7522 
7523 		if (skip_pte) {
7524 			++num_skipped_mappings;
7525 			goto fff_skip_pve;
7526 		}
7527 
7528 		tmplate = spte;
7529 
7530 		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
7531 			/* read protection sets the pte to fault */
7532 			tmplate = tmplate & ~ARM_PTE_AF;
7533 			ref_fault = true;
7534 		}
7535 		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
7536 			/* take away write permission if set */
7537 			if (pmap == kernel_pmap) {
7538 				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
7539 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
7540 					pte_set_was_writeable(tmplate, true);
7541 					mod_fault = true;
7542 				}
7543 			} else {
7544 				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
7545 					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
7546 					pte_set_was_writeable(tmplate, true);
7547 					mod_fault = true;
7548 				}
7549 			}
7550 		}
7551 
7552 		if (ptdp != NULL) {
7553 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
7554 			sptm_ops[num_mappings].vaddr = va;
7555 			sptm_ops[num_mappings].pte_template = tmplate;
7556 			++num_mappings;
7557 		} else if (pmap_insert_flush_range_template(tmplate, flush_range)) {
7558 			/**
7559 			 * We submit both the pending disjoint and pending region ops whenever
7560 			 * either category reaches the mapping limit.  Having pending operations
7561 			 * in either category will keep preemption disabled, and we want to ensure
7562 			 * that we can at least temporarily re-enable preemption roughly every
7563 			 * SPTM_MAPPING_LIMIT mappings.
7564 			 */
7565 			pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7566 			pvh_lock_sleep_mode_needed = true;
7567 			num_mappings = num_skipped_mappings = 0;
7568 		}
7569 fff_skip_pve:
7570 		if ((num_mappings + num_skipped_mappings) >= SPTM_MAPPING_LIMIT) {
7571 			if (flush_range != NULL) {
7572 				/* See comment above for why we submit both disjoint and region ops when we hit the limit. */
7573 				pmap_multipage_op_submit_disjoint(num_mappings, flush_range);
7574 				pmap_multipage_op_submit_region(flush_range);
7575 			} else if (num_mappings > 0) {
7576 				sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7577 			}
7578 			pvh_lock_sleep_mode_needed = true;
7579 			num_mappings = num_skipped_mappings = 0;
7580 		}
7581 		pte_p = PT_ENTRY_NULL;
7582 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7583 			pve_ptep_idx = 0;
7584 			pve_p = pve_next(pve_p);
7585 		}
7586 	}
7587 
7588 	if (num_mappings != 0) {
7589 		sptm_return_t sptm_ret;
7590 
7591 		if (flush_range == NULL) {
7592 			sptm_ret = sptm_update_disjoint(phys, sptm_pcpu->sptm_ops_pa, num_mappings, sptm_update_options);
7593 		} else {
7594 			/* Resync the pending mapping state in flush_range with our local state. */
7595 			assert(num_mappings >= flush_range->pending_disjoint_entries);
7596 			flush_range->pending_disjoint_entries = num_mappings;
7597 		}
7598 	}
7599 
7600 	/**
7601 	 * Undo the explicit disable_preemption() done in FFF_PERCPU_INIT().
7602 	 * Note that enable_preemption() decrements a per-thread counter, so if
7603 	 * we happen to still hold the PVH lock in spin mode then preemption won't
7604 	 * actually be re-enabled until we drop the lock (which also decrements
7605 	 * the per-thread counter).
7606 	 */
7607 	enable_preemption();
7608 
7609 	/*
7610 	 * If we are using the same approach for ref and mod
7611 	 * faults on this PTE, do not clear the write fault;
7612 	 * this would cause both ref and mod to be set on the
7613 	 * page again, and prevent us from taking ANY read/write
7614 	 * fault on the mapping.
7615 	 */
7616 	if (clear_write_fault && !ref_aliases_mod) {
7617 		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, local_locked_pvh.pvh, PT_ENTRY_NULL, 0);
7618 	}
7619 
7620 	pp_attr_t attrs_to_clear = (result ? bits_to_clear : 0);
7621 	pp_attr_t attrs_to_set = 0;
7622 	/* update global "reusable" status for this page */
7623 	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
7624 		attrs_to_clear |= PP_ATTR_REUSABLE;
7625 	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
7626 		attrs_to_set |= PP_ATTR_REUSABLE;
7627 	}
7628 
7629 	if (mod_fault) {
7630 		attrs_to_set |= PP_ATTR_MODFAULT;
7631 	}
7632 	if (ref_fault) {
7633 		attrs_to_set |= PP_ATTR_REFFAULT;
7634 	}
7635 
7636 	if (attrs_to_set | attrs_to_clear) {
7637 		ppattr_modify_bits(pai, attrs_to_clear, attrs_to_set);
7638 	}
7639 
7640 	if (__probable(locked_pvh == NULL)) {
7641 		pvh_unlock(&local_locked_pvh);
7642 	} else {
7643 		*locked_pvh = local_locked_pvh;
7644 	}
7645 	if ((flush_range != NULL) && !preemption_enabled()) {
7646 		flush_range->processed_entries += num_skipped_mappings;
7647 	}
7648 	return result;
7649 }
7650 
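/*
 * Validate that none of the internal-only force-fault option flags were
 * passed in, then invoke the flush-range variant with no flush range.
 */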
7651 MARK_AS_PMAP_TEXT boolean_t
7652 arm_force_fast_fault_internal(
7653 	ppnum_t         ppnum,
7654 	vm_prot_t       allow_mode,
7655 	int             options)
7656 {
7657 	if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
7658 		panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
7659 	}
7660 	return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL, 0, NULL);
7661 }
7662 
7663 /*
7664  *	Routine:	arm_force_fast_fault
7665  *
7666  *	Function:
7667  *		Force all mappings for this page to fault according
7668  *		to the access modes allowed, so we can gather ref/modify
7669  *		bits again.
7670  */
7671 
7672 boolean_t
7673 arm_force_fast_fault(
7674 	ppnum_t         ppnum,
7675 	vm_prot_t       allow_mode,
7676 	int             options,
7677 	__unused void   *arg)
7678 {
7679 	pmap_paddr_t    phys = ptoa(ppnum);
7680 
7681 	assert(ppnum != vm_page_fictitious_addr);
7682 
7683 	if (!pa_valid(phys)) {
7684 		return FALSE;   /* Not a managed page. */
7685 	}
7686 
7687 	return arm_force_fast_fault_internal(ppnum, allow_mode, options);
7688 }
7689 
7690 /**
7691  * Clear pending force fault for at most SPTM_MAPPING_LIMIT mappings for this
7692  * page based on the observed fault type, and update the appropriate ref/modify
7693  * bits for the physical page. This typically involves adding write permissions
7694  * back for write faults and setting the Access Flag for both read/write faults
7695  * (since the lack of those things is what caused the fault in the first place).
7696  *
7697  * @note At most SPTM_MAPPING_LIMIT mappings are modified in a single
7698  *       arm_clear_fast_fault() call, to avoid holding the PVH lock (which the
7699  *       caller should already hold for `ppnum`) for too long. If a fault is
7700  *       subsequently taken on a mapping we haven't processed, arm_fast_fault()
7701  *       will call this function with a non-NULL pte_p to perform a targeted
7702  *       fixup.
7703  *
7704  * @param ppnum Page number of the page to clear a pending force fault on.
7705  * @param fault_type The type of access/fault that triggered us wanting to clear
7706  *                   the pending force fault status. This determines how we
7707  *                   modify the PTE to not cause a fault in the future and also
7708  *                   whether we mark the PTE as referenced or modified.
7709  *                   Typically a write fault would cause the page to be marked
7710  *                   as referenced and modified, and a read fault would only
7711  *                   cause the page to be marked as referenced.
7712  * @param pvh pv_head_table entry value for [ppnum] returned by a previous call
7713  *            to pvh_lock().
7714  * @param pte_p If this value is non-PT_ENTRY_NULL then only this specified PTE
7715  *              will be modified. If it is PT_ENTRY_NULL, then every mapping to
7716  *              `ppnum` will be modified.
7717  * @param attrs_to_clear Mask of additional pp_attr_t bits to clear for the physical
7718  *                       page upon completion of this function.  This is typically
7719  *                       some combination of the REFFAULT and MODFAULT bits.
7720  *
7721  * @return TRUE if any PTEs were modified, FALSE otherwise.
7722  */
7723 MARK_AS_PMAP_TEXT static boolean_t
7724 arm_clear_fast_fault(
7725 	ppnum_t ppnum,
7726 	vm_prot_t fault_type,
7727 	uintptr_t pvh,
7728 	pt_entry_t *pte_p,
7729 	pp_attr_t attrs_to_clear)
7730 {
7731 	const pmap_paddr_t pa = ptoa(ppnum);
7732 	pv_entry_t     *pve_p;
7733 	boolean_t       result;
7734 	unsigned int    num_mappings = 0, num_skipped_mappings = 0;
7735 	pp_attr_t       attrs_to_set = 0;
7736 
7737 	assert(ppnum != vm_page_fictitious_addr);
7738 
7739 	if (!pa_valid(pa)) {
7740 		return FALSE;   /* Not a managed page. */
7741 	}
7742 
7743 	result = FALSE;
7744 	pve_p = PV_ENTRY_NULL;
7745 	if (pte_p == PT_ENTRY_NULL) {
7746 		if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
7747 			pte_p = pvh_ptep(pvh);
7748 		} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
7749 			pve_p = pvh_pve_list(pvh);
7750 		} else if (__improbable(!pvh_test_type(pvh, PVH_TYPE_NULL))) {
7751 			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)pvh, (uint64_t)pa);
7752 		}
7753 	}
7754 
7755 	disable_preemption();
7756 	pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
7757 	sptm_disjoint_op_t *sptm_ops = sptm_pcpu->sptm_ops;
7758 
7759 	int pve_ptep_idx = 0;
7760 
7761 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
7762 		pt_entry_t spte;
7763 		pt_entry_t tmplate;
7764 
7765 		if (pve_p != PV_ENTRY_NULL) {
7766 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
7767 			if (pte_p == PT_ENTRY_NULL) {
7768 				goto cff_skip_pve;
7769 			}
7770 		}
7771 
7772 #ifdef PVH_FLAG_IOMMU
7773 		if (pvh_ptep_is_iommu(pte_p)) {
7774 			++num_skipped_mappings;
7775 			goto cff_skip_pve;
7776 		}
7777 #endif
7778 		spte = os_atomic_load(pte_p, relaxed);
7779 		// A concurrent pmap_remove() may have cleared the PTE
7780 		if (__improbable(!pte_is_valid(spte))) {
7781 			++num_skipped_mappings;
7782 			goto cff_skip_pve;
7783 		}
7784 
7785 		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
7786 		const pmap_t pmap = ptdp->pmap;
7787 		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
7788 
7789 		assert(va >= pmap->min && va < pmap->max);
7790 
7791 		tmplate = spte;
7792 
7793 		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
7794 			{
7795 				if (pmap == kernel_pmap) {
7796 					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
7797 				} else {
7798 					assert(pmap->type != PMAP_TYPE_NESTED);
7799 					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
7800 				}
7801 			}
7802 
7803 			tmplate |= ARM_PTE_AF;
7804 
7805 			pte_set_was_writeable(tmplate, false);
7806 			attrs_to_set |= (PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
7807 		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
7808 			tmplate = spte | ARM_PTE_AF;
7809 
7810 			{
7811 				attrs_to_set |= PP_ATTR_REFERENCED;
7812 			}
7813 		}
7814 
7815 		assert(spte != ARM_PTE_EMPTY);
7816 
7817 		if (spte != tmplate) {
7818 			sptm_ops[num_mappings].root_pt_paddr = pmap->ttep;
7819 			sptm_ops[num_mappings].vaddr = va;
7820 			sptm_ops[num_mappings].pte_template = tmplate;
7821 			++num_mappings;
7822 			result = TRUE;
7823 		}
7824 
7825 cff_skip_pve:
7826 		if ((num_mappings + num_skipped_mappings) == SPTM_MAPPING_LIMIT) {
7827 			if (num_mappings != 0) {
7828 				sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
7829 				    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
7830 				num_mappings = 0;
7831 			}
7832 			/*
7833 			 * We've reached the limit of mappings that can be processed in a single arm_clear_fast_fault()
7834 			 * call.  Bail out here to avoid excessive PVH lock duration on the fault path.  If a fault is
7835 			 * subsequently taken on a mapping we haven't processed, arm_fast_fault() will call this
7836 			 * function with a non-NULL pte_p to perform a targeted fixup.
7837 			 */
7838 			break;
7839 		}
7840 
7841 		pte_p = PT_ENTRY_NULL;
7842 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
7843 			pve_ptep_idx = 0;
7844 			pve_p = pve_next(pve_p);
7845 		}
7846 	}
7847 
7848 	if (num_mappings != 0) {
7849 		assert(result == TRUE);
7850 		sptm_update_disjoint(pa, sptm_pcpu->sptm_ops_pa, num_mappings,
7851 		    SPTM_UPDATE_PERMS_AND_WAS_WRITABLE | SPTM_UPDATE_AF);
7852 	}
7853 
7854 	if (attrs_to_set | attrs_to_clear) {
7855 		ppattr_modify_bits(pa_index(pa), attrs_to_clear, attrs_to_set);
7856 	}
7857 	enable_preemption();
7858 
7859 	return result;
7860 }
7861 
7862 /*
7863  * Determine if the fault was induced by software tracking of
7864  * modify/reference bits.  If so, re-enable the mapping (and set
7865  * the appropriate bits).
7866  *
7867  * Returns KERN_SUCCESS if the fault was induced and was
7868  * successfully handled.
7869  *
7870  * Returns KERN_FAILURE if the fault was not induced and
7871  * the function was unable to deal with it.
7872  *
7873  * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
7874  * disallows this type of access.
7875  */
7876 MARK_AS_PMAP_TEXT kern_return_t
7877 arm_fast_fault_internal(
7878 	pmap_t pmap,
7879 	vm_map_address_t va,
7880 	vm_prot_t fault_type,
7881 	__unused bool was_af_fault,
7882 	__unused bool from_user)
7883 {
7884 	kern_return_t   result = KERN_FAILURE;
7885 	pt_entry_t     *ptep;
7886 	pt_entry_t      spte = ARM_PTE_EMPTY;
7887 	locked_pvh_t    locked_pvh = {.pvh = 0};
7888 	unsigned int    pai;
7889 	pmap_paddr_t    pa;
7890 	validate_pmap_mutable(pmap);
7891 
7892 	if (__probable(preemption_enabled())) {
7893 		pmap_lock(pmap, PMAP_LOCK_SHARED);
7894 	} else if (__improbable(!pmap_try_lock(pmap, PMAP_LOCK_SHARED))) {
7895 		/**
7896 		 * In certain cases, arm_fast_fault() may be invoked with preemption disabled
7897 		 * on the copyio path.  In these cases the (in-kernel) caller expects that any
7898 		 * faults taken against the user address may not be handled successfully
7899 		 * (vm_fault() allows non-preemptible callers with the possibility that the
7900 		 * fault may not be successfully handled) and will result in the copyio operation
7901 		 * returning EFAULT.  It is then the caller's responsibility to retry the copyio
7902 		 * operation in a preemptible context.
7903 		 *
7904 		 * For these cases attempting to acquire the sleepable lock will panic, so
7905 		 * we simply make a best effort and return failure just as the VM does if we
7906 		 * can't acquire the lock without sleeping.
7907 		 */
7908 		return result;
7909 	}
7910 
7911 	/*
7912 	 * If the entry doesn't exist, is completely invalid, or is already
7913 	 * valid, we can't fix it here.
7914 	 */
7915 
7916 	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
7917 	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
7918 	if (ptep != PT_ENTRY_NULL) {
7919 		while (true) {
7920 			spte = os_atomic_load(ptep, relaxed);
7921 
7922 			pa = pte_to_pa(spte);
7923 
7924 			if ((spte == ARM_PTE_EMPTY) || pte_is_compressed(spte, ptep)) {
7925 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
7926 				return result;
7927 			}
7928 
7929 			if (!pa_valid(pa)) {
7930 				const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
7931 				if (frame_type == XNU_PROTECTED_IO) {
7932 					result = KERN_PROTECTION_FAILURE;
7933 				}
7934 				pmap_unlock(pmap, PMAP_LOCK_SHARED);
7935 				return result;
7936 			}
7937 			pai = pa_index(pa);
7938 			/**
7939 			 * Check for preemption disablement and in that case use pvh_try_lock()
7940 			 * for the same reason we use pmap_try_lock() above.
7941 			 */
7942 			if (__probable(preemption_enabled())) {
7943 				locked_pvh = pvh_lock(pai);
7944 			} else {
7945 				locked_pvh = pvh_try_lock(pai);
7946 				if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
7947 					pmap_unlock(pmap, PMAP_LOCK_SHARED);
7948 					return result;
7949 				}
7950 			}
7951 			assert(locked_pvh.pvh != 0);
7952 			if (os_atomic_load(ptep, relaxed) == spte) {
7953 				/*
7954 				 * Double-check the spte value, as we care about the AF bit.
7955 				 * It's also possible that pmap_page_protect() transitioned the
7956 				 * PTE to compressed/empty before we grabbed the PVH lock.
7957 				 */
7958 				break;
7959 			}
7960 			pvh_unlock(&locked_pvh);
7961 		}
7962 	} else {
7963 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
7964 		return result;
7965 	}
7966 
7967 
7968 	if (result == KERN_SUCCESS) {
7969 		goto ff_cleanup;
7970 	}
7971 
7972 	pp_attr_t attrs = os_atomic_load(&pp_attr_table[pai], relaxed);
7973 	if ((attrs & PP_ATTR_REFFAULT) || ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT))) {
7974 		/*
7975 		 * An attempted access will always clear ref/mod fault state, as
7976 		 * appropriate for the fault type.  arm_clear_fast_fault will
7977 		 * update the associated PTEs for the page as appropriate; if
7978 		 * any PTEs are updated, we redrive the access.  If the mapping
7979 		 * does not actually allow for the attempted access, the
7980 		 * following fault will (hopefully) fail to update any PTEs, and
7981 		 * thus cause arm_fast_fault to decide that it failed to handle
7982 		 * the fault.
7983 		 */
7984 		pp_attr_t attrs_to_clear = 0;
7985 		if (attrs & PP_ATTR_REFFAULT) {
7986 			attrs_to_clear |= PP_ATTR_REFFAULT;
7987 		}
7988 		if ((fault_type & VM_PROT_WRITE) && (attrs & PP_ATTR_MODFAULT)) {
7989 			attrs_to_clear |= PP_ATTR_MODFAULT;
7990 		}
7991 
7992 		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, PT_ENTRY_NULL, attrs_to_clear)) {
7993 			/*
7994 			 * Should this preserve KERN_PROTECTION_FAILURE?  The
7995 			 * cost of not doing so is another fault in a case
7996 			 * that should already result in an exception.
7997 			 */
7998 			result = KERN_SUCCESS;
7999 		}
8000 	}
8001 
8002 	/*
8003 	 * If the PTE already has sufficient permissions, we can report the fault as handled.
8004 	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
8005 	 * on mappings of the same page.
8006 	 */
8007 	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
8008 		uintptr_t ap_ro, ap_rw, ap_x;
8009 		if (pmap == kernel_pmap) {
8010 			ap_ro = ARM_PTE_AP(AP_RONA);
8011 			ap_rw = ARM_PTE_AP(AP_RWNA);
8012 			ap_x = ARM_PTE_NX;
8013 		} else {
8014 			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
8015 			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
8016 			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
8017 		}
8018 		/*
8019 		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
8020 		 * hardware they may be xPRR-protected, in which case they'll be handled
8021 		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
8022 		 * handling path currently does not call arm_fast_fault() without at least
8023 		 * VM_PROT_READ in fault_type.
8024 		 */
8025 		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
8026 		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
8027 			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
8028 				result = KERN_SUCCESS;
8029 			}
8030 		}
8031 	}
8032 
8033 	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, locked_pvh.pvh, ptep, 0)) {
8034 		/*
8035 		 * A prior arm_clear_fast_fault() operation may have returned early due to
8036 		 * another pending PV list operation or an excessively large PV list.
8037 		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
8038 		 * taking a fault on the same mapping.
8039 		 */
8040 		result = KERN_SUCCESS;
8041 	}
8042 
8043 ff_cleanup:
8044 
8045 	pvh_unlock(&locked_pvh);
8046 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
8047 	return result;
8048 }
8049 
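/*
 * Public entry point for fast-fault handling: rejects VAs outside the pmap's
 * range and wraps arm_fast_fault_internal() with tracing.
 */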
8050 kern_return_t
8051 arm_fast_fault(
8052 	pmap_t pmap,
8053 	vm_map_address_t va,
8054 	vm_prot_t fault_type,
8055 	bool was_af_fault,
8056 	__unused bool from_user)
8057 {
8058 	kern_return_t   result = KERN_FAILURE;
8059 
8060 	if (va < pmap->min || va >= pmap->max) {
8061 		return result;
8062 	}
8063 
8064 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8065 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8066 	    from_user);
8067 
8068 
8069 	result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8070 
8071 	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8072 
8073 	return result;
8074 }
8075 
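/*
 *	pmap_copy_page copies the specified (machine independent) page.
 */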
8076 void
8077 pmap_copy_page(
8078 	ppnum_t psrc,
8079 	ppnum_t pdst,
8080 	int options)
8081 {
8082 	bcopy_phys_with_options((addr64_t) (ptoa(psrc)),
8083 	    (addr64_t) (ptoa(pdst)),
8084 	    PAGE_SIZE,
8085 	    options);
8086 }
8087 
8088 
8089 /*
8090  *	pmap_copy_part_page copies part of one (machine independent) page to another.
8091  */
8092 void
8093 pmap_copy_part_page(
8094 	ppnum_t psrc,
8095 	vm_offset_t src_offset,
8096 	ppnum_t pdst,
8097 	vm_offset_t dst_offset,
8098 	vm_size_t len)
8099 {
8100 	bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8101 	    (addr64_t) (ptoa(pdst) + dst_offset),
8102 	    len);
8103 }
8104 
8105 
8106 /*
8107  *	pmap_zero_page zeros the specified (machine independent) page.
8108  */
8109 void
8110 pmap_zero_page(
8111 	ppnum_t pn)
8112 {
8113 	assert(pn != vm_page_fictitious_addr);
8114 	bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8115 }
8116 
8117 /*
8118  *	pmap_zero_page_with_options allows the caller to specify further operations
8119  *	to perform with the zeroing.
8120  */
8121 void
8122 pmap_zero_page_with_options(
8123 	ppnum_t pn,
8124 	int options)
8125 {
8126 	assert(pn != vm_page_fictitious_addr);
8127 	bzero_phys_with_options((addr64_t) ptoa(pn), PAGE_SIZE, options);
8128 }
8129 
8130 /*
8131  *	pmap_zero_part_page
8132  *	zeros the specified (machine independent) part of a page.
8133  */
8134 void
8135 pmap_zero_part_page(
8136 	ppnum_t pn,
8137 	vm_offset_t offset,
8138 	vm_size_t len)
8139 {
8140 	assert(pn != vm_page_fictitious_addr);
8141 	assert(offset + len <= PAGE_SIZE);
8142 	bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8143 }
8144 
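/*
 *	pmap_map_globals establishes the fixed read-only kernel alias of the
 *	low-globals page at LOWGLOBAL_ALIAS.
 */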
8145 void
8146 pmap_map_globals(
8147 	void)
8148 {
8149 	pt_entry_t      pte;
8150 
8151 	pte = pa_to_pte(kvtophys_nofail((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX |
8152 	    ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE_VALID;
8153 #if __ARM_KERNEL_PROTECT__
8154 	pte |= ARM_PTE_NG;
8155 #endif /* __ARM_KERNEL_PROTECT__ */
8156 	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8157 	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8158 	sptm_map_page(kernel_pmap->ttep, LOWGLOBAL_ALIAS, pte);
8159 
8160 
8161 #if KASAN
8162 	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8163 #endif
8164 }
8165 
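/*
 * Return the kernel virtual address of copy window 'index' for CPU 'cpu_num',
 * panicking if the index is outside the per-CPU window range.
 */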
8166 vm_offset_t
8167 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8168 {
8169 	if (__improbable(index >= CPUWINDOWS_MAX)) {
8170 		panic("%s: invalid index %u", __func__, index);
8171 	}
8172 	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8173 }
8174 
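/*
 * Map physical page 'pn' into a free per-CPU copy window on the current CPU
 * with the requested protection and cacheability attributes, and return the
 * index of the window used.  Expects preemption to be disabled so the mapping
 * stays associated with this CPU.
 */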
8175 MARK_AS_PMAP_TEXT unsigned int
8176 pmap_map_cpu_windows_copy_internal(
8177 	ppnum_t pn,
8178 	vm_prot_t prot,
8179 	unsigned int wimg_bits)
8180 {
8181 	pt_entry_t      *ptep = NULL, pte;
8182 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8183 	unsigned int    cpu_num;
8184 	unsigned int    cpu_window_index;
8185 	vm_offset_t     cpu_copywindow_vaddr = 0;
8186 	bool            need_strong_sync = false;
8187 
8188 	assert(get_preemption_level() > 0);
8189 	cpu_num = pmap_cpu_data->cpu_number;
8190 
8191 	for (cpu_window_index = 0; cpu_window_index < CPUWINDOWS_MAX; cpu_window_index++) {
8192 		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, cpu_window_index);
8193 		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8194 		assert(!pte_is_compressed(*ptep, ptep));
8195 		if (!pte_is_valid(*ptep)) {
8196 			break;
8197 		}
8198 	}
8199 	if (__improbable(cpu_window_index == CPUWINDOWS_MAX)) {
8200 		panic("%s: out of windows", __func__);
8201 	}
8202 
8203 	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE_VALID | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8204 #if __ARM_KERNEL_PROTECT__
8205 	pte |= ARM_PTE_NG;
8206 #endif /* __ARM_KERNEL_PROTECT__ */
8207 	pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8208 
8209 	if (prot & VM_PROT_WRITE) {
8210 		pte |= ARM_PTE_AP(AP_RWNA);
8211 	} else {
8212 		pte |= ARM_PTE_AP(AP_RONA);
8213 	}
8214 
8215 	/*
8216 	 * It's expected to be safe for an interrupt handler to nest copy-window usage with the
8217 	 * active thread on a CPU, as long as a sufficient number of copy windows are available.
8218 	 * --If the interrupt handler executes before the active thread creates the per-CPU mapping,
8219 	 *   or after the active thread completely removes the mapping, it may use the same mapping
8220 	 *   but will finish execution and tear down the mapping without the thread needing to know.
8221 	 * --If the interrupt handler executes after the active thread creates the per-CPU mapping,
8222 	 *   it will observe the valid mapping and use a different copy window.
8223 	 * --If the interrupt handler executes after the active thread clears the PTE in
8224 	 *   pmap_unmap_cpu_windows_copy() but before the active thread flushes the TLB, the code
8225 	 *   for computing cpu_window_index above will observe the PTE_INVALID_IN_FLIGHT token set
8226 	 *   by the SPTM, and will select a different index.
8227 	 */
8228 	const sptm_return_t sptm_status = sptm_map_page(kernel_pmap->ttep, cpu_copywindow_vaddr, pte);
8229 	if (__improbable(sptm_status != SPTM_SUCCESS)) {
8230 		panic("%s: failed to map CPU copy-window VA 0x%llx with SPTM status %d",
8231 		    __func__, (unsigned long long)cpu_copywindow_vaddr, sptm_status);
8232 	}
8233 
8234 
8235 	/*
8236 	 * Clean up any pending strong TLB flush for the same window in a thread we may have
8237 	 * interrupted.
8238 	 */
8239 	if (__improbable(pmap_cpu_data->copywindow_strong_sync[cpu_window_index])) {
8240 		arm64_sync_tlb(true);
8241 	}
8242 	pmap_cpu_data->copywindow_strong_sync[cpu_window_index] = need_strong_sync;
8243 
8244 	return cpu_window_index;
8245 }
8246 
8247 unsigned int
8248 pmap_map_cpu_windows_copy(
8249 	ppnum_t pn,
8250 	vm_prot_t prot,
8251 	unsigned int wimg_bits)
8252 {
8253 	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8254 }
8255 
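/*
 * Tear down the per-CPU copy window 'index' on the current CPU, performing
 * any pending strong TLB synchronization for the window.
 */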
8256 MARK_AS_PMAP_TEXT void
8257 pmap_unmap_cpu_windows_copy_internal(
8258 	unsigned int index)
8259 {
8260 	unsigned int    cpu_num;
8261 	vm_offset_t     cpu_copywindow_vaddr = 0;
8262 	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8263 
8264 	assert(index < CPUWINDOWS_MAX);
8265 	assert(get_preemption_level() > 0);
8266 
8267 	cpu_num = pmap_cpu_data->cpu_number;
8268 
8269 	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8270 	/* Issue full-system DSB to ensure prior operations on the per-CPU window
8271 	 * (which are likely to have been on I/O memory) are complete before
8272 	 * tearing down the mapping. */
8273 	__builtin_arm_dsb(DSB_SY);
8274 	sptm_unmap_region(kernel_pmap->ttep, cpu_copywindow_vaddr, 1, 0);
8275 	if (__improbable(pmap_cpu_data->copywindow_strong_sync[index])) {
8276 		arm64_sync_tlb(true);
8277 		pmap_cpu_data->copywindow_strong_sync[index] = false;
8278 	}
8279 }
8280 
8281 void
8282 pmap_unmap_cpu_windows_copy(
8283 	unsigned int index)
8284 {
8285 	return pmap_unmap_cpu_windows_copy_internal(index);
8286 }
8287 
8288 /*
8289  * Indicate that a pmap is intended to be used as a nested pmap
8290  * within one or more larger address spaces.  This must be set
8291  * before pmap_nest() is called with this pmap as the 'subordinate'.
8292  */
8293 MARK_AS_PMAP_TEXT void
8294 pmap_set_nested_internal(
8295 	pmap_t pmap)
8296 {
8297 	validate_pmap_mutable(pmap);
8298 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8299 	if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8300 		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8301 		    __func__, pmap, pmap->type);
8302 	}
8303 	pmap->type = PMAP_TYPE_NESTED;
8304 	sptm_retype_params_t retype_params = {.raw = SPTM_RETYPE_PARAMS_NULL};
8305 	retype_params.attr_idx = (pt_attr_page_size(pt_attr) == 4096) ? SPTM_PT_GEOMETRY_4K : SPTM_PT_GEOMETRY_16K;
8306 	pmap_txm_acquire_exclusive_lock(pmap);
8307 	sptm_retype(pmap->ttep, XNU_USER_ROOT_TABLE, XNU_SHARED_ROOT_TABLE, retype_params);
8308 	pmap_txm_release_exclusive_lock(pmap);
8309 	pmap_get_pt_ops(pmap)->free_id(pmap);
8310 }
8311 
8312 void
8313 pmap_set_nested(
8314 	pmap_t pmap)
8315 {
8316 	pmap_set_nested_internal(pmap);
8317 }
8318 
8319 bool
8320 pmap_is_nested(
8321 	pmap_t pmap)
8322 {
8323 	return pmap->type == PMAP_TYPE_NESTED;
8324 }
8325 
8326 /*
8327  * pmap_trim_range(pmap, start, end)
8328  *
8329  * pmap  = pmap to operate on
8330  * start = start of the range
8331  * end   = end of the range
8332  *
8333  * Attempts to deallocate TTEs for the given range within the nested region.
8334  */
8335 MARK_AS_PMAP_TEXT static void
8336 pmap_trim_range(
8337 	pmap_t pmap,
8338 	addr64_t start,
8339 	addr64_t end)
8340 {
8341 	addr64_t cur;
8342 	addr64_t nested_region_start;
8343 	addr64_t nested_region_end;
8344 	addr64_t adjusted_start;
8345 	addr64_t adjusted_end;
8346 	addr64_t adjust_offmask;
8347 	tt_entry_t * tte_p;
8348 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8349 
8350 	if (__improbable(end < start)) {
8351 		panic("%s: invalid address range, "
8352 		    "pmap=%p, start=%p, end=%p",
8353 		    __func__,
8354 		    pmap, (void*)start, (void*)end);
8355 	}
8356 
8357 	nested_region_start = pmap->nested_region_addr;
8358 	nested_region_end = nested_region_start + pmap->nested_region_size;
8359 
8360 	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8361 		panic("%s: range outside nested region %p-%p, "
8362 		    "pmap=%p, start=%p, end=%p",
8363 		    __func__, (void *)nested_region_start, (void *)nested_region_end,
8364 		    pmap, (void*)start, (void*)end);
8365 	}
8366 
8367 	/* Contract the range to TT page boundaries. */
8368 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8369 
8370 	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8371 	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8372 	adjusted_end = end & ~adjust_offmask;
8373 
8374 	/* Iterate over the range, trying to remove TTEs. */
8375 	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += (pt_attr_twig_size(pt_attr) * page_ratio)) {
8376 		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8377 
8378 		tte_p = pmap_tte(pmap, cur);
8379 
8380 		if ((tte_p != NULL) && tte_is_valid_table(*tte_p)) {
8381 			/* pmap_tte_deallocate()/pmap_tte_trim() will drop the pmap lock */
8382 			if ((pmap->type == PMAP_TYPE_NESTED) && (sptm_get_page_table_refcnt(tte_to_pa(*tte_p)) == 0)) {
8383 				/* Deallocate for the nested map. */
8384 				pmap_tte_deallocate(pmap, cur, tte_p, pt_attr_twig_level(pt_attr));
8385 			} else if (pmap->type == PMAP_TYPE_USER) {
8386 				/**
8387 				 * Just remove for the parent map. If the leaf table pointed
8388 				 * to by the TTE being removed (owned by the nested pmap)
8389 				 * has any mappings, then this call will panic. This
8390 				 * enforces the policy that tables being trimmed must be
8391 				 * empty to prevent possible use-after-free attacks.
8392 				 */
8393 				pmap_tte_trim(pmap, cur, tte_p);
8394 			} else {
8395 				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8396 			}
8397 		} else {
8398 			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8399 		}
8400 	}
8401 }
8402 
8403 /*
8404  * pmap_trim_internal(grand, subord, vstart, size)
8405  *
8406  * grand  = pmap subord is nested in
8407  * subord = nested pmap
8408  * vstart = start of the used range in grand
8409  * size   = size of the used range
8410  *
8411  * Attempts to trim the shared region page tables down to only cover the given
8412  * range in subord and grand.
8413  */
8414 MARK_AS_PMAP_TEXT void
8415 pmap_trim_internal(
8416 	pmap_t grand,
8417 	pmap_t subord,
8418 	addr64_t vstart,
8419 	uint64_t size)
8420 {
8421 	addr64_t vend;
8422 	addr64_t adjust_offmask;
8423 
8424 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
8425 		panic("%s: grand addr wraps around, "
8426 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8427 		    __func__, grand, subord, (void*)vstart, size);
8428 	}
8429 
8430 	validate_pmap_mutable(grand);
8431 	validate_pmap(subord);
8432 
8433 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8434 
8435 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8436 
8437 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8438 		panic("%s: subord is of non-nestable type 0x%hhx, "
8439 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8440 		    __func__, subord->type, grand, subord, (void*)vstart, size);
8441 	}
8442 
8443 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
8444 		panic("%s: grand is of unsupported type 0x%hhx for nesting, "
8445 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8446 		    __func__, grand->type, grand, subord, (void*)vstart, size);
8447 	}
8448 
8449 	if (__improbable(grand->nested_pmap != subord)) {
8450 		panic("%s: grand->nested != subord, "
8451 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8452 		    __func__, grand, subord, (void*)vstart, size);
8453 	}
8454 
8455 	if (__improbable((size != 0) &&
8456 	    ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
8457 		panic("%s: grand range not in nested region, "
8458 		    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8459 		    __func__, grand, subord, (void*)vstart, size);
8460 	}
8461 
8462 
8463 	if (!grand->nested_has_no_bounds_ref) {
8464 		assert(subord->nested_bounds_set);
8465 
8466 		if (!grand->nested_bounds_set) {
8467 			/* Inherit the bounds from subord. */
8468 			grand->nested_region_true_start = subord->nested_region_true_start;
8469 			grand->nested_region_true_end = subord->nested_region_true_end;
8470 			grand->nested_bounds_set = true;
8471 		}
8472 
8473 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8474 		return;
8475 	}
8476 
8477 	if ((!subord->nested_bounds_set) && size) {
8478 		const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
8479 		adjust_offmask = pt_attr_leaf_table_offmask(pt_attr) * page_ratio;
8480 
8481 		subord->nested_region_true_start = vstart;
8482 		subord->nested_region_true_end = vend;
8483 		subord->nested_region_true_start &= ~adjust_offmask;
8484 
8485 		if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
8486 			panic("%s: padded true end wraps around, "
8487 			    "grand=%p, subord=%p, vstart=%p, size=%#llx",
8488 			    __func__, grand, subord, (void*)vstart, size);
8489 		}
8490 
8491 		subord->nested_region_true_end &= ~adjust_offmask;
8492 		subord->nested_bounds_set = true;
8493 	}
8494 
8495 	if (subord->nested_bounds_set) {
8496 		/* Inherit the bounds from subord. */
8497 		grand->nested_region_true_start = subord->nested_region_true_start;
8498 		grand->nested_region_true_end = subord->nested_region_true_end;
8499 		grand->nested_bounds_set = true;
8500 
8501 		/* If we know the bounds, we can trim the pmap. */
8502 		grand->nested_has_no_bounds_ref = false;
8503 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8504 	} else {
8505 		/* Don't trim if we don't know the bounds. */
8506 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8507 		return;
8508 	}
8509 
8510 	/* Trim grand to only cover the given range. */
8511 	pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
8512 	pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
8513 
8514 	/* Try to trim subord. */
8515 	pmap_trim_subord(subord);
8516 }
8517 
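/*
 * Drop this pmap's "no bounds" reference on its nested pmap, trimming this
 * pmap to the nested pmap's true bounds (if known) and then attempting to
 * trim the nested pmap itself.
 */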
8518 MARK_AS_PMAP_TEXT static void
8519 pmap_trim_self(pmap_t pmap)
8520 {
8521 	if (pmap->nested_has_no_bounds_ref && pmap->nested_pmap) {
8522 		/* If we have a no bounds ref, we need to drop it. */
8523 		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
8524 		pmap->nested_has_no_bounds_ref = false;
8525 		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
8526 		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
8527 		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
8528 		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
8529 
8530 		if (nested_bounds_set) {
8531 			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
8532 			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
8533 		}
8534 		/*
8535 		 * Try trimming the nested pmap, in case we had the
8536 		 * last reference.
8537 		 */
8538 		pmap_trim_subord(pmap->nested_pmap);
8539 	}
8540 }
8541 
8542 /*
8543  * pmap_trim_subord(subord)
8544  *
8545  * subord = nested pmap we are attempting to trim
8546  *
8547  * Drops a no-bounds reference on subord and trims it if that was the last
8548  * such reference and subord's true bounds are known.
8549  */
8550 MARK_AS_PMAP_TEXT static void
8551 pmap_trim_subord(pmap_t subord)
8552 {
8553 	bool contract_subord = false;
8554 
8555 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8556 
8557 	subord->nested_no_bounds_refcnt--;
8558 
8559 	if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
8560 		/* If this was the last no bounds reference, trim subord. */
8561 		contract_subord = true;
8562 	}
8563 
8564 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8565 
8566 	if (contract_subord) {
8567 		pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
8568 		pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
8569 	}
8570 }
8571 
8572 void
8573 pmap_trim(
8574 	pmap_t grand,
8575 	pmap_t subord,
8576 	addr64_t vstart,
8577 	uint64_t size)
8578 {
8579 	pmap_trim_internal(grand, subord, vstart, size);
8580 }
8581 
8582 #if HAS_APPLE_PAC
8583 
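/*
 * Sign a user pointer with the given key and discriminator under the supplied
 * user JOP key, with interrupts disabled while the user key is enabled.
 */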
8584 void *
8585 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8586 {
8587 	void *res = NULL;
8588 	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8589 
8590 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8591 	__compiler_materialize_and_prevent_reordering_on(value);
8592 	res = sptm_sign_user_pointer(value, key, discriminator, jop_key);
8593 	__compiler_materialize_and_prevent_reordering_on(res);
8594 	ml_disable_user_jop_key(jop_key, saved_jop_state);
8595 
8596 	ml_set_interrupts_enabled(current_intr_state);
8597 
8598 	return res;
8599 }
8600 
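/*
 * Authenticate a user pointer signed with the given key and discriminator.
 * On authentication failure the pointer is poisoned rather than returned
 * unchanged.
 */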
8601 void *
8602 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
8603 {
8604 	void *res = NULL;
8605 	const boolean_t current_intr_state = ml_set_interrupts_enabled(FALSE);
8606 
8607 	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
8608 	__compiler_materialize_and_prevent_reordering_on(value);
8609 	res = sptm_auth_user_pointer(value, key, discriminator, jop_key);
8610 	__compiler_materialize_and_prevent_reordering_on(res);
8611 	ml_disable_user_jop_key(jop_key, saved_jop_state);
8612 
8613 	if (res == SPTM_AUTH_FAILURE) {
8614 		res = ml_poison_ptr(value, key);
8615 	}
8616 
8617 	ml_set_interrupts_enabled(current_intr_state);
8618 
8619 	return res;
8620 }
8621 #endif /* HAS_APPLE_PAC */
8622 
8623 /*
8624  *	kern_return_t pmap_nest(grand, subord, vstart, size)
8625  *
8626  *	grand  = the pmap that we will nest subord into
8627  *	subord = the pmap that is nested into grand
8628  *	vstart = start of the range in grand to be nested
8629  *	size   = size of the nested area (up to 16TB)
8630  *
8631  *	Inserts a pmap into another.  This is used to implement shared segments.
8632  *
8633  */
8634 
8635 /**
8636  * Embeds a range of mappings from one pmap ('subord') into another ('grand')
8637  * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
8638  * This function operates in 4 main phases:
8639  * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
8640  * 2. Expansion of subord to ensure the required leaf-level page table pages for
8641  *    the mapping range are present in subord.
8642  * 3. Expansion of grand to ensure the required twig-level page table pages for
8643  *    the mapping range are present in grand.
8644  * 4. Invoke sptm_nest_region() to copy the relevant TTEs from subord to grand.
8645  *
8646  * This function may return early due to pending AST_URGENT preemption; if so
8647  * it will indicate the need to be re-entered.
8648  *
8649  * @param grand pmap to insert the TTEs into.  Must be a user pmap.
8650  * @param subord pmap from which to extract the TTEs.  Must be a nested pmap.
8651  * @param vstart twig-aligned virtual address for the beginning of the nesting range
8652  * @param size twig-aligned size of the nesting range
8653  *
8654  * @return KERN_RESOURCE_SHORTAGE on allocation failure, KERN_SUCCESS otherwise
8655  */
8656 MARK_AS_PMAP_TEXT kern_return_t
8657 pmap_nest_internal(
8658 	pmap_t grand,
8659 	pmap_t subord,
8660 	addr64_t vstart,
8661 	uint64_t size)
8662 {
8663 	kern_return_t kr = KERN_SUCCESS;
8664 	vm_map_offset_t vaddr;
8665 	tt_entry_t     *stte_p;
8666 	tt_entry_t     *gtte_p;
8667 	bitmap_t       *nested_region_unnested_table_bitmap;
8668 	int             expand_options = 0;
8669 	bool            deref_subord = true;
8670 
8671 	addr64_t vend;
8672 	if (__improbable(os_add_overflow(vstart, size, &vend))) {
8673 		panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
8674 	}
8675 
8676 	validate_pmap_mutable(grand);
8677 	validate_pmap(subord);
8678 	os_ref_retain_raw(&subord->ref_count, &pmap_refgrp);
8679 
8680 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8681 	if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
8682 		panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
8683 	}
8684 
8685 	if (__improbable(((size | vstart) &
8686 	    (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
8687 		panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx",
8688 		    grand, vstart, size);
8689 	}
8690 
8691 	if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
8692 		panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
8693 	}
8694 
8695 	if (__improbable(grand->type != PMAP_TYPE_USER)) {
8696 		panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
8697 	}
8698 
8699 	/**
8700 	 * Use an acquire barrier to ensure that subsequent loads of nested_region_* fields are not
8701 	 * speculated ahead of the load of nested_region_unnested_table_bitmap, so that if we observe a non-NULL
8702 	 * nested_region_unnested_table_bitmap then we can be sure the other fields have been initialized as well.
8703 	 */
8704 	if (os_atomic_load(&subord->nested_region_unnested_table_bitmap, acquire) == NULL) {
8705 		uint64_t nested_region_unnested_table_bits = size >> pt_attr_twig_shift(pt_attr);
8706 
8707 		if (__improbable((nested_region_unnested_table_bits > UINT_MAX))) {
8708 			panic("%s: bitmap allocation size %llu will truncate, "
8709 			    "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
8710 			    __func__, nested_region_unnested_table_bits,
8711 			    grand, subord, vstart, size);
8712 		}
8713 
8714 		nested_region_unnested_table_bitmap = bitmap_alloc((uint) nested_region_unnested_table_bits);
8715 
8716 		pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8717 		if (subord->nested_region_unnested_table_bitmap == NULL) {
8718 			subord->nested_region_addr = vstart;
8719 			subord->nested_region_size = (mach_vm_offset_t) size;
8720 			sptm_configure_shared_region(subord->ttep, vstart, size >> pt_attr->pta_page_shift);
8721 
8722 			/**
8723 			 * Ensure that the rest of the subord->nested_region_* fields are
8724 			 * initialized and visible before setting the nested_region_unnested_table_bitmap
8725 			 * field (which is used as the flag to say that the rest are initialized).
8726 			 */
8727 			os_atomic_store(&subord->nested_region_unnested_table_bitmap, nested_region_unnested_table_bitmap, release);
8728 			nested_region_unnested_table_bitmap = NULL;
8729 		}
8730 		pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8731 		if (nested_region_unnested_table_bitmap != NULL) {
8732 			bitmap_free(nested_region_unnested_table_bitmap, nested_region_unnested_table_bits);
8733 		}
8734 	}
8735 
8736 	assertf(subord->nested_region_addr == vstart, "%s: pmap %p nested region addr 0x%llx doesn't match vstart 0x%llx",
8737 	    __func__, subord, (unsigned long long)subord->nested_region_addr, (unsigned long long)vstart);
8738 	assertf(subord->nested_region_size == size, "%s: pmap %p nested region size 0x%llx doesn't match size 0x%llx",
8739 	    __func__, subord, (unsigned long long)subord->nested_region_size, (unsigned long long)size);
8740 
8741 	pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8742 
8743 	if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
8744 		/*
8745 		 * If this is grand's first nesting operation, keep the reference on subord.
8746 		 * It will be released by pmap_destroy_internal() when grand is destroyed.
8747 		 */
8748 		deref_subord = false;
8749 
8750 		if (!subord->nested_bounds_set) {
8751 			/*
8752 			 * We are nesting without the shared regions bounds
8753 			 * being known.  We'll have to trim the pmap later.
8754 			 */
8755 			grand->nested_has_no_bounds_ref = true;
8756 			subord->nested_no_bounds_refcnt++;
8757 		}
8758 
8759 		grand->nested_region_addr = vstart;
8760 		grand->nested_region_size = (mach_vm_offset_t) size;
8761 	} else {
8762 		if (__improbable(grand->nested_pmap != subord)) {
8763 			panic("pmap_nest() pmap %p has a nested pmap", grand);
8764 		} else if (__improbable(grand->nested_region_addr > vstart)) {
8765 			panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
8766 		} else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
8767 			grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
8768 		}
8769 	}
8770 
8771 	vaddr = vstart;
8772 	if (vaddr < subord->nested_region_true_start) {
8773 		vaddr = subord->nested_region_true_start;
8774 	}
8775 
8776 	addr64_t true_end = vend;
8777 	if (true_end > subord->nested_region_true_end) {
8778 		true_end = subord->nested_region_true_end;
8779 	}
8780 
8781 	while (vaddr < true_end) {
8782 		stte_p = pmap_tte(subord, vaddr);
8783 		if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
8784 			pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8785 			kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
8786 
8787 			if (kr != KERN_SUCCESS) {
8788 				pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8789 				goto done;
8790 			}
8791 
8792 			pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
8793 		}
8794 		vaddr += pt_attr_twig_size(pt_attr);
8795 	}
8796 
8797 	/*
8798 	 * copy TTEs from subord pmap into grand pmap
8799 	 */
8800 
8801 	vaddr = (vm_map_offset_t) vstart;
8802 	if (vaddr < subord->nested_region_true_start) {
8803 		vaddr = subord->nested_region_true_start;
8804 	}
8805 
8806 	pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
8807 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8808 
8809 	while (vaddr < true_end) {
8810 		gtte_p = pmap_tte(grand, vaddr);
8811 		if (gtte_p == PT_ENTRY_NULL) {
8812 			pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
8813 			kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
8814 			pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
8815 
8816 			if (kr != KERN_SUCCESS) {
8817 				goto done;
8818 			}
8819 		}
8820 
8821 		vaddr += pt_attr_twig_size(pt_attr);
8822 	}
8823 
8824 	vaddr = (vm_map_offset_t) vstart;
8825 
8826 	/*
8827 	 * It is possible to have a preempted nest operation execute concurrently
8828 	 * with a trim operation that sets nested_region_true_start.  In this case,
8829 	 * update the nesting bounds.  This is useful both as a performance
8830 	 * optimization and to prevent an attempt to nest a just-trimmed TTE,
8831 	 * which will trigger an SPTM violation.
8832 	 * Note that pmap_trim() may concurrently update grand's bounds as we are
8833 	 * making these checks, but in that case pmap_trim_range() has not yet
8834 	 * been called on grand and will wait for us to drop grand's lock, so it
8835 	 * should see any TTEs we've nested here and clear them appropriately.
8836 	 */
8837 	if (vaddr < subord->nested_region_true_start) {
8838 		vaddr = subord->nested_region_true_start;
8839 	}
8840 	if (vaddr < grand->nested_region_true_start) {
8841 		vaddr = grand->nested_region_true_start;
8842 	}
8843 	if (true_end > subord->nested_region_true_end) {
8844 		true_end = subord->nested_region_true_end;
8845 	}
8846 	if (true_end > grand->nested_region_true_end) {
8847 		true_end = grand->nested_region_true_end;
8848 	}
8849 
8850 	while (vaddr < true_end) {
8851 		/*
8852 		 * The SPTM requires the run of TTE updates to all reside within the same L2 page, so the region
8853 		 * we supply to the SPTM can't span multiple L1 TTEs.
8854 		 */
8855 		vm_map_offset_t vlim = ((vaddr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
8856 		if (vlim > true_end) {
8857 			vlim = true_end;
8858 		}
8859 		pmap_txm_acquire_exclusive_lock(grand);
8860 		pmap_txm_acquire_shared_lock(subord);
8861 		sptm_nest_region(grand->ttep, subord->ttep, vaddr, (vlim - vaddr) >> pt_attr->pta_page_shift);
8862 		pmap_txm_release_shared_lock(subord);
8863 		pmap_txm_release_exclusive_lock(grand);
8864 		vaddr = vlim;
8865 	}
8866 
8867 done:
8868 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
8869 	if (deref_subord) {
8870 		pmap_destroy_internal(subord);
8871 	}
8872 
8873 	return kr;
8874 }
8875 
8876 kern_return_t
8877 pmap_nest(
8878 	pmap_t grand,
8879 	pmap_t subord,
8880 	addr64_t vstart,
8881 	uint64_t size)
8882 {
8883 	kern_return_t kr = KERN_SUCCESS;
8884 
8885 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
8886 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
8887 	    VM_KERNEL_ADDRHIDE(vstart));
8888 
8889 	pmap_verify_preemptible();
8890 	kr = pmap_nest_internal(grand, subord, vstart, size);
8891 
8892 	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
8893 
8894 	return kr;
8895 }
8896 
8897 /*
8898  *	kern_return_t pmap_unnest(grand, vaddr, size)
8899  *
8900  *	grand  = the pmap that will have the virtual range unnested
8901  *	vaddr  = start of range in pmap to be unnested
8902  *	size   = size of range in pmap to be unnested
8903  *
8904  */
8905 
8906 kern_return_t
8907 pmap_unnest(
8908 	pmap_t grand,
8909 	addr64_t vaddr,
8910 	uint64_t size)
8911 {
8912 	return pmap_unnest_options(grand, vaddr, size, 0);
8913 }
8914 
8915 /**
8916  * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
8917  * from a top-level pmap ('grand').  The corresponding mappings in the nested
8918  * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
8919  * still have the region nested.  The mappings in 'grand' will be left empty
8920  * with the assumption that they will be demand-filled by subsequent access faults.
8921  *
8922  * This function operates in 2 main phases:
8923  * 1. Iteration over the nested pmap's mappings for the specified range to mark
8924  *    them non-global.
8925  * 2. Calling the SPTM to clear the twig-level TTEs for the address range in grand.
8926  *
8927  * This function may return early due to pending AST_URGENT preemption; if so
8928  * it will indicate the need to be re-entered.
8929  *
8930  * @param grand pmap from which to unnest mappings
8931  * @param vaddr twig-aligned virtual address for the beginning of the nested range
8932  * @param size twig-aligned size of the nested range
8933  * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
8934  *        grand is being torn down and step 1) above is not needed.
8935  */
8936 MARK_AS_PMAP_TEXT void
8937 pmap_unnest_options_internal(
8938 	pmap_t grand,
8939 	addr64_t vaddr,
8940 	uint64_t size,
8941 	unsigned int option)
8942 {
8943 	vm_map_offset_t start;
8944 	vm_map_offset_t addr;
8945 	unsigned int    current_index;
8946 	unsigned int    start_index;
8947 	unsigned int    max_index;
8948 
8949 	addr64_t vend;
8950 	addr64_t true_end;
8951 	if (__improbable(os_add_overflow(vaddr, size, &vend))) {
8952 		panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
8953 	}
8954 
8955 	validate_pmap_mutable(grand);
8956 
8957 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
8958 
8959 	if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
8960 		panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
8961 		    (unsigned long long)vaddr, (unsigned long long)size);
8962 	}
8963 
8964 	if (__improbable(grand->nested_pmap == NULL)) {
8965 		panic("%s: %p has no nested pmap", __func__, grand);
8966 	}
8967 
8968 	true_end = vend;
8969 	if (true_end > grand->nested_pmap->nested_region_true_end) {
8970 		true_end = grand->nested_pmap->nested_region_true_end;
8971 	}
8972 
8973 	if ((option & PMAP_UNNEST_CLEAN) == 0) {
8974 		if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
8975 			panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
8976 		}
8977 
8978 		/*
8979 		 * SPTM TODO: I suspect we may be able to hold the nested pmap lock shared here.
8980 		 * We would need to use atomic_bitmap_set below where we currently use bitmap_test + bitmap_set.
8981 		 * The risk is that a concurrent pmap_enter() against the nested pmap could observe the relevant
8982 		 * bit in the nested region bitmap to be clear, but could then create the (global) mapping after
8983 		 * we've made our SPTM sweep below to set NG.  In that case we could end up with a mix of global
8984 		 * and non-global mappings for the same VA region and thus a TLB conflict.  I'm uncertain if the
8985 		 * VM would allow these operation to happen concurrently.  Even if it does, we could still do
8986 		 * VM would allow these operations to happen concurrently.  Even if it does, we could still do
8987 		 * the bitmap.
8988 		 */
8989 		pmap_lock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
8990 
8991 		disable_preemption();
8992 		pmap_sptm_percpu_data_t *sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
8993 		unsigned int num_mappings = 0;
8994 		start = vaddr;
8995 		if (start < grand->nested_pmap->nested_region_true_start) {
8996 			start = grand->nested_pmap->nested_region_true_start;
8997 		}
8998 		start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
8999 		max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
9000 
9001 		for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
9002 			pt_entry_t  *bpte, *cpte;
9003 
9004 			vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
9005 
9006 			bpte = pmap_pte(grand->nested_pmap, addr);
9007 
9008 			if (!bitmap_test(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index)) {
9009 				/*
9010 				 * We've marked the 'twig' region as being unnested.  Every mapping entered within
9011 				 * the nested pmap in this region will now be marked non-global.
9012 				 */
9013 				bitmap_set(grand->nested_pmap->nested_region_unnested_table_bitmap, current_index);
9014 				for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
9015 					pt_entry_t  spte = os_atomic_load(cpte, relaxed);
9016 
9017 					if (pte_is_valid(spte)) {
9018 						spte |= ARM_PTE_NG;
9019 					}
9020 
9021 					addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
9022 
9023 					sptm_pcpu->sptm_templates[num_mappings] = spte;
9024 					++num_mappings;
9025 
9026 					if (num_mappings == SPTM_MAPPING_LIMIT) {
9027 						pmap_retype_epoch_enter();
9028 						sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9029 						    sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9030 						pmap_retype_epoch_exit();
9031 						enable_preemption();
9032 						num_mappings = 0;
9033 						start = addr;
9034 						disable_preemption();
9035 						sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9036 					}
9037 				}
9038 			}
9039 			/**
9040 			 * The SPTM does not allow region updates to span multiple leaf page tables, so request
9041 			 * any remaining updates up to vlim before moving to the next page table page.
9042 			 */
9043 			if (num_mappings != 0) {
9044 				pmap_retype_epoch_enter();
9045 				sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9046 				    sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9047 				pmap_retype_epoch_exit();
9048 				enable_preemption();
9049 				num_mappings = 0;
9050 				disable_preemption();
9051 				sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9052 			}
9053 			addr = start = vlim;
9054 		}
9055 
9056 		if (num_mappings != 0) {
9057 			pmap_retype_epoch_enter();
9058 			sptm_update_region(grand->nested_pmap->ttep, start, num_mappings,
9059 			    sptm_pcpu->sptm_templates_pa, SPTM_UPDATE_NG);
9060 			pmap_retype_epoch_exit();
9061 		}
9062 
9063 		enable_preemption();
9064 		pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
9065 	}
9066 
9067 	/*
9068 	 * invalidate all pdes for segment at vaddr in pmap grand
9069 	 */
9070 	addr = vaddr;
9071 
9072 	pmap_lock(grand, PMAP_LOCK_EXCLUSIVE);
9073 
9074 	if (addr < grand->nested_pmap->nested_region_true_start) {
9075 		addr = grand->nested_pmap->nested_region_true_start;
9076 	}
9077 
9078 	if (true_end > grand->nested_pmap->nested_region_true_end) {
9079 		true_end = grand->nested_pmap->nested_region_true_end;
9080 	}
9081 
9082 	while (addr < true_end) {
9083 		vm_map_offset_t vlim = ((addr + pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
9084 		if (vlim > true_end) {
9085 			vlim = true_end;
9086 		}
9087 		sptm_unnest_region(grand->ttep, grand->nested_pmap->ttep, addr, (vlim - addr) >> pt_attr->pta_page_shift);
9088 		addr = vlim;
9089 	}
9090 
9091 	pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9092 }
9093 
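/**
 * Undo a previous nesting of a shared-region range in pmap 'grand'.
 *
 * This wrapper emits the PMAP__UNNEST trace events, verifies that the caller
 * is running in a preemptible context, and defers the actual work to
 * pmap_unnest_options_internal().
 */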
9094 kern_return_t
9095 pmap_unnest_options(
9096 	pmap_t grand,
9097 	addr64_t vaddr,
9098 	uint64_t size,
9099 	unsigned int option)
9100 {
9101 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
9102 	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
9103 
9104 	pmap_verify_preemptible();
9105 	pmap_unnest_options_internal(grand, vaddr, size, option);
9106 
9107 	PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
9108 
9109 	return KERN_SUCCESS;
9110 }
9111 
9112 boolean_t
9113 pmap_adjust_unnest_parameters(
9114 	__unused pmap_t p,
9115 	__unused vm_map_offset_t *s,
9116 	__unused vm_map_offset_t *e)
9117 {
9118 	return TRUE; /* to get to log_unnest_badness()... */
9119 }
9120 
9121 #if PMAP_FORK_NEST
9122 /**
9123  * Perform any necessary pre-nesting of the parent's shared region at fork()
9124  * time.
9125  *
9126  * @note This should only be called from vm_map_fork().
9127  *
9128  * @param old_pmap The pmap of the parent task.
9129  * @param new_pmap The pmap of the child task.
9130  * @param nesting_start An output parameter that is updated with the start
9131  *                      address of the range that was pre-nested
9132  * @param nesting_end An output parameter that is updated with the end
9133  *                      address of the range that was pre-nested
9134  *
9135  * @return KERN_SUCCESS if the pre-nesting was successfully completed.
9136  *         KERN_INVALID_ARGUMENT if the arguments were not valid.
9137  */
9138 kern_return_t
9139 pmap_fork_nest(
9140 	pmap_t old_pmap,
9141 	pmap_t new_pmap,
9142 	vm_map_offset_t *nesting_start,
9143 	vm_map_offset_t *nesting_end)
9144 {
9145 	if (old_pmap == NULL || new_pmap == NULL) {
9146 		return KERN_INVALID_ARGUMENT;
9147 	}
9148 	if (old_pmap->nested_pmap == NULL) {
9149 		return KERN_SUCCESS;
9150 	}
9151 	pmap_nest(new_pmap,
9152 	    old_pmap->nested_pmap,
9153 	    old_pmap->nested_region_addr,
9154 	    old_pmap->nested_region_size);
9155 	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
9156 	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
9157 	    new_pmap->nested_region_size == old_pmap->nested_region_size,
9158 	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
9159 	    new_pmap->nested_pmap,
9160 	    new_pmap->nested_region_addr,
9161 	    new_pmap->nested_region_size,
9162 	    old_pmap->nested_pmap,
9163 	    old_pmap->nested_region_addr,
9164 	    old_pmap->nested_region_size);
9165 	*nesting_start = old_pmap->nested_region_addr;
9166 	*nesting_end = *nesting_start + old_pmap->nested_region_size;
9167 	return KERN_SUCCESS;
9168 }
9169 #endif /* PMAP_FORK_NEST */
9170 
9171 /*
9172  * disable no-execute capability on
9173  * the specified pmap
9174  */
9175 #if DEVELOPMENT || DEBUG
9176 void
9177 pmap_disable_NX(
9178 	pmap_t pmap)
9179 {
9180 	pmap->nx_enabled = FALSE;
9181 }
9182 #else
9183 void
9184 pmap_disable_NX(
9185 	__unused pmap_t pmap)
9186 {
9187 }
9188 #endif
9189 
9190 /*
9191  * flush a range of hardware TLB entries.
9192  * NOTE: assumes the smallest TLB entry in use will be for
9193  * an ARM small page (4K).
9194  */
9195 
9196 #if __ARM_RANGE_TLBI__
9197 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD 1
9198 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  ARM64_TLB_RANGE_MAX_PAGES
9199 #else
9200 #define ARM64_FULL_TLB_FLUSH_THRESHOLD  256
9201 #endif // __ARM_RANGE_TLBI__
9202 
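/**
 * Issue (but do not synchronize) TLB invalidations for a virtual address
 * range belonging to 'pmap'.
 *
 * Depending on how many pages the range covers, this either flushes the whole
 * TLB (or just the pmap's ASID), issues a ranged invalidate when
 * __ARM_RANGE_TLBI__ is available, or invalidates page by page.  Nested pmaps
 * use the 'allrange'/'allentries' variants, since their mappings may be
 * visible under multiple ASIDs.
 */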
9203 static void
9204 flush_mmu_tlb_region_asid_async(
9205 	vm_offset_t va,
9206 	size_t length,
9207 	pmap_t pmap,
9208 	bool last_level_only __unused)
9209 {
9210 	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
9211 	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
9212 	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
9213 	const uint16_t asid = PMAP_HWASID(pmap);
9214 
9215 	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
9216 		boolean_t       flush_all = FALSE;
9217 
9218 		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
9219 			flush_all = TRUE;
9220 		}
9221 		if (flush_all) {
9222 			flush_mmu_tlb_async();
9223 		} else {
9224 			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, false);
9225 		}
9226 		return;
9227 	}
9228 #if __ARM_RANGE_TLBI__
9229 	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
9230 		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
9231 		if (pmap->type == PMAP_TYPE_NESTED) {
9232 			flush_mmu_tlb_allrange_async(va, last_level_only, false);
9233 		} else {
9234 			flush_mmu_tlb_range_async(va, last_level_only, false);
9235 		}
9236 		return;
9237 	}
9238 #endif
9239 	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
9240 	va = tlbi_asid(asid) | tlbi_addr(va);
9241 
9242 	if (pmap->type == PMAP_TYPE_NESTED) {
9243 		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, false);
9244 	} else {
9245 		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, false);
9246 	}
9247 }
9248 
9249 void
9250 flush_mmu_tlb_region(
9251 	vm_offset_t va,
9252 	unsigned length)
9253 {
9254 	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true);
9255 	sync_tlb_flush();
9256 }
9257 
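/**
 * Return the VM_WIMG cache attributes associated with physical page 'pn'.
 *
 * For addresses that are not kernel-managed, the result comes from the I/O
 * attribute table (VM_WIMG_IO if no matching range is found).  For managed
 * pages, the WIMG bits recorded in pp_attr_table are returned, defaulting to
 * VM_WIMG_DEFAULT when none are set.
 */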
9258 unsigned int
9259 pmap_cache_attributes(
9260 	ppnum_t pn)
9261 {
9262 	pmap_paddr_t    paddr;
9263 	unsigned int    pai;
9264 	unsigned int    result;
9265 	pp_attr_t       pp_attr_current;
9266 
9267 	paddr = ptoa(pn);
9268 
9269 	assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
9270 
9271 	if (!pa_valid(paddr)) {
9272 		pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
9273 		return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
9274 	}
9275 
9276 	result = VM_WIMG_DEFAULT;
9277 
9278 	pai = pa_index(paddr);
9279 
9280 	pp_attr_current = pp_attr_table[pai];
9281 	if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9282 		result = pp_attr_current & PP_ATTR_WIMG_MASK;
9283 	}
9284 	return result;
9285 }
9286 
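/**
 * Perform the cache maintenance needed when a page's WIMG attributes change:
 * flush the page when it transitions away from a cacheable type, and
 * force-clean the data cache when it transitions to VM_WIMG_RT.
 */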
9287 MARK_AS_PMAP_TEXT static void
9288 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
9289 {
9290 	if ((wimg_bits_prev != wimg_bits_new)
9291 	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
9292 	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
9293 	    && (wimg_bits_new != VM_WIMG_COPYBACK))
9294 	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
9295 	    && ((wimg_bits_new != VM_WIMG_COPYBACK) && (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
9296 		pmap_sync_page_attributes_phys(pn);
9297 	}
9298 
9299 	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
9300 		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
9301 	}
9302 }
9303 
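/**
 * Switch a managed page's cache attributes for use by the VM compressor.
 * The PTEs and physical-aperture mapping are updated without touching the
 * pp_attr_table, so the page's original WIMG bits are preserved, and any
 * required cache synchronization is performed afterwards.
 */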
9304 MARK_AS_PMAP_TEXT __unused void
9305 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
9306 {
9307 	pmap_paddr_t paddr = ptoa(pn);
9308 
9309 	if (__improbable(!pa_valid(paddr))) {
9310 		panic("%s called on non-managed page 0x%08x", __func__, pn);
9311 	}
9312 
9313 	pmap_set_cache_attributes_internal(pn, new_cacheattr, false);
9314 
9315 	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
9316 }
9317 
9318 static inline bool
9319 cacheattr_supports_compressor(unsigned int cacheattr)
9320 {
9321 	switch (cacheattr) {
9322 	case VM_WIMG_DEFAULT:
9323 		return true;
9324 	default:
9325 		return false;
9326 	}
9327 }
9328 
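/**
 * Return a kernel virtual address at which the compressor can access page
 * 'pn'.  If the page's current cache attributes are not compatible with
 * compressor use, they are temporarily switched to VM_WIMG_DEFAULT;
 * pmap_unmap_compressor_page() switches them back.
 */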
9329 void *
9330 pmap_map_compressor_page(ppnum_t pn)
9331 {
9332 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9333 	if (!cacheattr_supports_compressor(cacheattr)) {
9334 		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
9335 	}
9336 
9337 	return (void*)phystokv(ptoa(pn));
9338 }
9339 
9340 void
9341 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
9342 {
9343 	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
9344 	if (!cacheattr_supports_compressor(cacheattr)) {
9345 		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
9346 	}
9347 }
9348 
9349 /**
9350  * Flushes TLB entries associated with the page specified by paddr, but does not
9351  * issue barriers yet.
9352  *
9353  * @param paddr The physical address to be flushed from TLB. Must be a managed address.
9354  */
9355 static void
9356 pmap_flush_tlb_for_paddr_async(pmap_paddr_t paddr)
9357 {
9358 	/* Flush the physical aperture mappings. */
9359 	const vm_offset_t kva = phystokv(paddr);
9360 	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true);
9361 
9362 	/* Flush the mappings tracked in the ptes. */
9363 	const unsigned int pai = pa_index(paddr);
9364 	locked_pvh_t locked_pvh = pvh_lock(pai);
9365 
9366 	pt_entry_t *pte_p = PT_ENTRY_NULL;
9367 	pv_entry_t *pve_p = PV_ENTRY_NULL;
9368 
9369 	if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PTEP)) {
9370 		pte_p = pvh_ptep(locked_pvh.pvh);
9371 	} else if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
9372 		pve_p = pvh_pve_list(locked_pvh.pvh);
9373 		pte_p = PT_ENTRY_NULL;
9374 	}
9375 
9376 	unsigned int nptes = 0;
9377 	int pve_ptep_idx = 0;
9378 	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
9379 		if (pve_p != PV_ENTRY_NULL) {
9380 			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
9381 			if (pte_p == PT_ENTRY_NULL) {
9382 				goto flush_tlb_skip_pte;
9383 			}
9384 		}
9385 
9386 		if (__improbable(nptes == SPTM_MAPPING_LIMIT)) {
9387 			pvh_lock_enter_sleep_mode(&locked_pvh);
9388 		}
9389 		++nptes;
9390 #ifdef PVH_FLAG_IOMMU
9391 		if (pvh_ptep_is_iommu(pte_p)) {
9392 			goto flush_tlb_skip_pte;
9393 		}
9394 #endif /* PVH_FLAG_IOMMU */
9395 		const pmap_t pmap = ptep_get_pmap(pte_p);
9396 		const vm_map_address_t va = ptep_get_va(pte_p);
9397 
9398 		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true);
9399 
9400 flush_tlb_skip_pte:
9401 		pte_p = PT_ENTRY_NULL;
9402 		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9403 			pve_ptep_idx = 0;
9404 			pve_p = pve_next(pve_p);
9405 		}
9406 	}
9407 	pvh_unlock(&locked_pvh);
9408 }
9409 
9410 /**
9411  * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
9412  *
9413  * @param pai The Physical Address Index of the entry.
9414  * @param cacheattr The new cache attribute.
9415  */
9416 MARK_AS_PMAP_TEXT static void
9417 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
9418 {
9419 	pvh_assert_locked(pai);
9420 
9421 	pp_attr_t pp_attr_current, pp_attr_template;
9422 	do {
9423 		pp_attr_current = pp_attr_table[pai];
9424 		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
9425 
9426 		/**
9427 		 * WIMG bits should only be updated under the PVH lock, but we should do
9428 		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
9429 		 */
9430 	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
9431 }
9432 
9433 /**
9434  * Structure for tracking where we are during the collection of mappings for batch
9435  * cache attribute updates.
9436  *
9437  * @note We need to track where in the per-cpu ops table we are filling the next mappings into,
9438  *       because the collection routine can return with a partially filled ops table when
9439  *       it exhausts the PV list for a page. In that case, the remaining slots in the ops table
9440  *       will be used for mappings of the next page.
9441  *
9442  * @note We also need to record where we are in the PV list, because the collection routine can
9443  *       also return when the ops table is filled but it's still in the middle of the PV list.
9444  *       Those remaining items in the PV list need to be handled by the next batch operation in
9445  *       a new ops table.
9446  */
9447 typedef struct {
9448 	/* Where we are in the sptm ops table. */
9449 	unsigned int sptm_ops_index;
9450 
9451 	/**
9452 	 * The last collected physical address from the previous full ops array (and in turn, SPTM
9453 	 * call). This is used to know whether the SPTM call for the latest full ops table should
9454 	 * skip updating the PAPT mapping (seeing as the last call would have handled updating it).
9455 	 */
9456 	pmap_paddr_t last_table_last_papt_pa;
9457 
9458 	/**
9459 	 * Where we are in the pv list.
9460 	 *
9461 	 * When ptep is non-null, there's only one mapping to the page and the ptep is the address
9462 	 * of it.
9463 	 *
9464 	 * When pvep is non-null, there's more than one mapping and the mappings are tracked by the
9465 	 * PV list.
9466 	 *
9467 	 * When they are both null, it indicates we are collecting for a new page and the collection
9468 	 * function will initialize them to be one of the two states above.
9469 	 *
9470 	 * It is undefined when they are both non-null.
9471 	 */
9472 	pt_entry_t *ptep;
9473 	pv_entry_t *pvep;
9474 	unsigned int pve_ptep_idx;
9475 } pmap_sptm_update_cache_attr_ops_collect_state_t;
9476 
9477 /**
9478  * Reports whether there are any pending ops in an SPTM cache attr ops table.
9479  *
9480  * @param state A pmap_sptm_update_cache_attr_ops_collect_state_t structure.
9481  *
9482  * @return True if there's any outstanding cache attr op.
9483  *         False otherwise.
9484  */
9485 static inline bool
9486 pmap_is_sptm_update_cache_attr_ops_pending(pmap_sptm_update_cache_attr_ops_collect_state_t state)
9487 {
9488 	return state.sptm_ops_index > 0;
9489 }
9490 
9491 /**
9492  * Enum encoding the collection status in pmap_sptm_update_cache_attr_ops_collect()'s
9493  * return value, indicating what action the caller needs to take.
9494  */
9495 typedef enum {
9496 	OPS_COLLECT_NOTHING = 0x0,
9497 
9498 	/* The ops table is full, and the caller should commit the table to SPTM. */
9499 	OPS_COLLECT_RETURN_FULL_TABLE = 0x1,
9500 
9501 	/**
9502 	 * The page has its mappings completely collected, and the caller should
9503 	 * pass in a new page next time.
9504 	 */
9505 	OPS_COLLECT_RETURN_COMPLETED_PAGE = 0x2,
9506 } pmap_sptm_update_cache_attr_ops_collect_return_t;
9507 
9508 /**
9509  * Collects mappings of a physical page into an SPTM ops table for cache attribute updates.
9510  *
9511  * @note This routine returns either when the ops table is full or the page represented by
9512  *       pa has no more mappings to collect. The caller should call this routine again with
9513  *       a fresh ops table, or a new page, or both, depending on the return code.
9514  *
9515  * @note The PVH lock needs to be held for pa.
9516  *
9517  * @param state Tracks the state of PV list traversal and SPTM ops table filling. It is used
9518  *              by this routine to save the progress of the collection.
9519  * @param sptm_ops Pointer to the SPTM ops table.
9520  * @param pa The physical address whose mappings are to be collected.
9521  * @param attributes The new cache attributes.
9522  *
9523  * @return A pmap_sptm_update_cache_attr_ops_collect_return_t that encodes what the caller
9524  *         should do before calling this routine again. See the inline comments around
9525  *         pmap_sptm_update_cache_attr_ops_collect_return_t for details.
9526  */
9527 static pmap_sptm_update_cache_attr_ops_collect_return_t
9528 pmap_sptm_update_cache_attr_ops_collect(
9529 	pmap_sptm_update_cache_attr_ops_collect_state_t *state,
9530 	sptm_update_disjoint_multipage_op_t *sptm_ops,
9531 	pmap_paddr_t pa,
9532 	unsigned int attributes)
9533 {
9534 	if (state == NULL || sptm_ops == NULL) {
9535 		panic("%s: unexpected null arguments - state: %p, sptm_ops: %p", __func__, state, sptm_ops);
9536 	}
9537 
9538 	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_START, pa, attributes, state->sptm_ops_index);
9539 
9540 	/* Copy the states into local variables. */
9541 	unsigned int sptm_ops_index = state->sptm_ops_index;
9542 	pmap_paddr_t last_table_last_papt_pa = state->last_table_last_papt_pa;
9543 	pv_entry_t *pvep = state->pvep;
9544 	pt_entry_t *ptep = state->ptep;
9545 	unsigned int pve_ptep_idx = state->pve_ptep_idx;
9546 
9547 	unsigned int pai = pa_index(pa);
9548 
9549 	/* We should have at least one free slot in the ops table. */
9550 	assert(sptm_ops_index < SPTM_MAPPING_LIMIT);
9551 
9552 	/* The PVH lock for pa has to be locked. */
9553 	pvh_assert_locked(pai);
9554 
9555 	/* If pvep and ptep are both null in the state, it's a new page. Initialize the states. */
9556 	if (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL) {
9557 		const uintptr_t pvh = pai_to_pvh(pai);
9558 		if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
9559 			ptep = PT_ENTRY_NULL;
9560 			pvep = pvh_pve_list(pvh);
9561 			pve_ptep_idx = 0;
9562 		} else if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
9563 			ptep  = pvh_ptep(pvh);
9564 			pvep = PV_ENTRY_NULL;
9565 			pve_ptep_idx = 0;
9566 		}
9567 	}
9568 
9569 	/**
9570 	 * The first entry filled in is always the PAPT header entry:
9571 	 *
9572 	 * 1) In the case of a fresh ops table, the first entry has to be a PAPT header.
9573 	 * 2) In the case of a fresh page, we need to insert a new PAPT header to request
9574 	 *    SPTM to operate on a new page.
9575 	 *
9576 	 * Remember the index of the PAPT header here so that we can update the number
9577 	 * of mappings field later when we finish collecting.
9578 	 */
9579 	const unsigned int papt_sptm_ops_index = sptm_ops_index;
9580 	unsigned int num_mappings = 0;
9581 
9582 	/* Assemble the PTE template for the PAPT mapping. */
9583 	const vm_address_t kva = phystokv(pa);
9584 	const pt_entry_t *papt_ptep = pmap_pte(kernel_pmap, kva);
9585 
9586 	pt_entry_t template = os_atomic_load(papt_ptep, relaxed);
9587 	template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9588 	template |= wimg_to_pte(attributes, pa);
9589 
9590 	/* Fill in the PAPT header entry. */
9591 	sptm_ops[papt_sptm_ops_index].per_paddr_header.paddr = pa;
9592 	sptm_ops[papt_sptm_ops_index].per_paddr_header.papt_pte_template = template;
9593 	sptm_ops[papt_sptm_ops_index].per_paddr_header.options = SPTM_UPDATE_SH | SPTM_UPDATE_MAIR | SPTM_UPDATE_DEFER_TLBI;
9594 
9595 	if ((papt_sptm_ops_index == 0) && (pa == last_table_last_papt_pa)) {
9596 		/**
9597 		 * If the previous SPTM call was made with an ops table that already included
9598 		 * updating the PA of the page that this table starts with, then we can assume
9599 		 * that call already updated the PAPT and we can safely skip it in this
9600 		 * upcoming one.
9601 		 */
9602 		sptm_ops[0].per_paddr_header.options |= SPTM_UPDATE_SKIP_PAPT;
9603 	}
9604 
9605 	sptm_ops_index++;
9606 
9607 	/**
9608 	 * Main loop for collecting the mappings into the ops table. It terminates either
9609 	 * when the ops table is full or the PV list is exhausted.
9610 	 */
9611 	while ((sptm_ops_index < SPTM_MAPPING_LIMIT) && (pvep != PV_ENTRY_NULL || ptep != PT_ENTRY_NULL)) {
9612 		/**
9613 		 * Update ptep. There are really two cases here:
9614 		 *
9615 		 * 1) pvep is PV_ENTRY_NULL. In this case, ptep holds the pointer to
9616 		 *    the only mapping to the page.
9617 		 * 2) pvep is not PV_ENTRY_NULL. In that case, ptep is updated according to
9618 		 *    pvep and pve_ptep_idx.
9619 		 */
9620 		if (pvep != PV_ENTRY_NULL) {
9621 			ptep = pve_get_ptep(pvep, pve_ptep_idx);
9622 
9623 			/* This pve is empty, so skip to next one. */
9624 			if (ptep == PT_ENTRY_NULL) {
9625 				goto sucaoc_skip_pte;
9626 			}
9627 		}
9628 
9629 #ifdef PVH_FLAG_IOMMU
9630 		/* Skip IOMMU pteps. */
9631 		if (pvh_ptep_is_iommu(ptep)) {
9632 			goto sucaoc_skip_pte;
9633 		}
9634 #endif
9635 		/* Assemble the PTE template for the mapping. */
9636 		const vm_address_t va = ptep_get_va(ptep);
9637 		const pmap_t pmap = ptep_get_pmap(ptep);
9638 
9639 		template = os_atomic_load(ptep, relaxed);
9640 		template &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
9641 		template |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, pa);
9642 
9643 		/* Fill into the ops table. */
9644 		sptm_ops[sptm_ops_index].disjoint_op.root_pt_paddr = pmap->ttep;
9645 		sptm_ops[sptm_ops_index].disjoint_op.vaddr = va;
9646 		sptm_ops[sptm_ops_index].disjoint_op.pte_template = template;
9647 
9648 		/* Move the sptm ops table cursor. */
9649 		sptm_ops_index++;
9650 
9651 		/* Increment the mappings counter. */
9652 		num_mappings++;
9653 
9654 sucaoc_skip_pte:
9655 		/**
9656 		 * Reset ptep to PT_ENTRY_NULL to keep the loop precondition true: at the top of
9657 		 * each iteration, at most one of ptep and pvep is non-NULL.
9658 		 */
9659 		ptep = PT_ENTRY_NULL;
9660 
9661 		/* Advance to next pvep if we have exhausted the pteps in it. */
9662 		if ((pvep != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
9663 			pve_ptep_idx = 0;
9664 			pvep = pve_next(pvep);
9665 		}
9666 	}
9667 
9668 	/* Update the PAPT header for the number of mappings. */
9669 	sptm_ops[papt_sptm_ops_index].per_paddr_header.num_mappings = num_mappings;
9670 
9671 	const bool full_table = (sptm_ops_index >= SPTM_MAPPING_LIMIT);
9672 	const bool collection_done_for_page = (pvep == PV_ENTRY_NULL && ptep == PT_ENTRY_NULL);
9673 
9674 	/**
9675 	 * The ops table is full, so the caller should now invoke the SPTM before calling
9676 	 * into this function again.
9677 	 */
9678 	if (full_table) {
9679 		/* Update last_table_last_papt_pa to be the pa collected in this call. */
9680 		last_table_last_papt_pa = pa;
9681 
9682 		/* Reset sptm_ops_index. */
9683 		sptm_ops_index = 0;
9684 	}
9685 
9686 	/* Copy the updated collection states back to the parameter structure. */
9687 	state->sptm_ops_index = sptm_ops_index;
9688 	state->last_table_last_papt_pa = last_table_last_papt_pa;
9689 	state->pvep = pvep;
9690 	state->ptep = ptep;
9691 	state->pve_ptep_idx = pve_ptep_idx;
9692 
9693 	/* Assemble the return value. */
9694 	pmap_sptm_update_cache_attr_ops_collect_return_t retval = OPS_COLLECT_NOTHING;
9695 
9696 	if (full_table) {
9697 		retval |= OPS_COLLECT_RETURN_FULL_TABLE;
9698 	}
9699 
9700 	if (collection_done_for_page) {
9701 		retval |= OPS_COLLECT_RETURN_COMPLETED_PAGE;
9702 	}
9703 
9704 	PMAP_TRACE(2, PMAP_CODE(PMAP__COLLECT_CACHE_OPS) | DBG_FUNC_END, pa, attributes, sptm_ops_index);
9705 
9706 	return retval;
9707 }
9708 
9709 /* At least one PAPT header plus one mapping. */
9710 static_assert(SPTM_MAPPING_LIMIT >= 2);
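
/*
 * Illustrative (simplified) caller pattern for the collection routine above.
 * The real driver is pmap_batch_set_cache_attributes_internal() below, which
 * additionally handles PVH locking, preemption, and the retype epoch:
 *
 *	pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
 *	for (each managed page pa in the batch) {
 *		pmap_sptm_update_cache_attr_ops_collect_return_t ret;
 *		do {
 *			ret = pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, pa, attr);
 *			if (ret & OPS_COLLECT_RETURN_FULL_TABLE) {
 *				sptm_update_disjoint_multipage(sptm_ops_pa, SPTM_MAPPING_LIMIT);
 *			}
 *		} while (!(ret & OPS_COLLECT_RETURN_COMPLETED_PAGE));
 *	}
 *	if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
 *		sptm_update_disjoint_multipage(sptm_ops_pa, state.sptm_ops_index);
 *	}
 */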
9711 
9712 /**
9713  * Returns whether a cache attribute is allowed (on managed pages).
9714  *
9715  * @param attributes A 32-bit value whose VM_WIMG_MASK bits represent the
9716  *                   cache attribute.
9717  *
9718  * @return True if the cache attribute is allowed on managed pages.
9719  *         False otherwise.
9720  */
9721 static bool
9722 pmap_is_cache_attribute_allowed(unsigned int attributes)
9723 {
9724 	if (pmap_panic_dev_wimg_on_managed) {
9725 		switch (attributes & VM_WIMG_MASK) {
9726 		/* supported on DRAM, but slow, so we disallow */
9727 		case VM_WIMG_IO:                        // nGnRnE
9728 		case VM_WIMG_POSTED:                    // nGnRE
9729 
9730 		/* unsupported on DRAM */
9731 		case VM_WIMG_POSTED_REORDERED:          // nGRE
9732 		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
9733 			return false;
9734 
9735 		default:
9736 			return true;
9737 		}
9738 	}
9739 
9740 	return true;
9741 }
9742 
9743 /**
9744  * Batch updates the cache attributes of a list of pages in three passes.
9745  *
9746  * In pass one, the pp_attr_table and the pte are updated (by SPTM) for the pages in the list.
9747  * In pass two, TLB entries are flushed for each page in the list if necessary.
9748  * In pass three, caches are cleaned for each page in the list if necessary.
9749  *
9750  * @param page_list List of pages to be updated.
9751  * @param cacheattr The new cache attributes.
9752  * @param update_attr_table Whether the pp_attr_table should be updated. This is useful for compressor
9753  *                          pages where it's desired to keep the old WIMG bits.
9754  */
9755 void
9756 pmap_batch_set_cache_attributes_internal(
9757 	const unified_page_list_t *page_list,
9758 	unsigned int cacheattr,
9759 	bool update_attr_table)
9760 {
9761 	bool tlb_flush_pass_needed = false;
9762 	bool rt_cache_flush_pass_needed = false;
9763 	bool preemption_disabled = false;
9764 
9765 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE1);
9766 
9767 	pmap_sptm_percpu_data_t *sptm_pcpu = NULL;
9768 	sptm_update_disjoint_multipage_op_t *sptm_ops = NULL;
9769 
9770 	pmap_sptm_update_cache_attr_ops_collect_state_t state = {0};
9771 
9772 	unified_page_list_iterator_t iter;
9773 
9774 	for (unified_page_list_iterator_init(page_list, &iter);
9775 	    !unified_page_list_iterator_end(&iter);
9776 	    unified_page_list_iterator_next(&iter)) {
9777 		bool is_fictitious = false;
9778 		const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
9779 		const pmap_paddr_t paddr = ptoa(pn);
9780 
9781 		/**
9782 		 * Skip if the page is not managed.
9783 		 *
9784 		 * We don't panic here because sometimes the user just blindly passes in
9785 		 * pages that are not managed. We need to handle that gracefully.
9786 		 */
9787 		if (__improbable(!pa_valid(paddr) || is_fictitious)) {
9788 			continue;
9789 		}
9790 
9791 		const unsigned int pai = pa_index(paddr);
9792 		locked_pvh_t locked_pvh = {.pvh = 0};
9793 
9794 		if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
9795 			/**
9796 			 * If we're partway through processing a multi-page batched call,
9797 			 * preemption will already be disabled so we can't simply call
9798 			 * pvh_lock() which may block.  Instead, we first try to acquire
9799 			 * the lock without waiting, which in most cases should succeed.
9800 			 * If it fails, we submit the pending batched operations to re-
9801 			 * enable preemption and then acquire the lock normally.
9802 			 */
9803 			locked_pvh = pvh_try_lock(pai);
9804 			if (__improbable(!pvh_try_lock_success(&locked_pvh))) {
9805 				assert(preemption_disabled);
9806 				const sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
9807 				pmap_retype_epoch_exit();
9808 				enable_preemption();
9809 				preemption_disabled = false;
9810 				if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9811 					tlb_flush_pass_needed = true;
9812 				}
9813 				state.sptm_ops_index = 0;
9814 				locked_pvh = pvh_lock(pai);
9815 			}
9816 		} else {
9817 			locked_pvh = pvh_lock(pai);
9818 		}
9819 		assert(locked_pvh.pvh != 0);
9820 
9821 		const pp_attr_t pp_attr_current = pp_attr_table[pai];
9822 
9823 		unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
9824 		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
9825 			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
9826 		}
9827 
9828 		const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
9829 
9830 		unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
9831 		if (pp_attr_template & PP_ATTR_WIMG_MASK) {
9832 			wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
9833 		}
9834 
9835 		/**
9836 		 * When update_attr_table is false, we know that wimg_bits_prev read from pp_attr_table is not to be trusted,
9837 		 * and we should force update the cache attribute.
9838 		 */
9839 		const bool force_update = !update_attr_table;
9840 		/* Update the cache attributes in PTE and PP_ATTR table. */
9841 		if ((wimg_bits_new != wimg_bits_prev) || force_update) {
9842 			if (!pmap_is_cache_attribute_allowed(cacheattr)) {
9843 				panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, pn=%#x",
9844 				    __func__, cacheattr & VM_WIMG_MASK, pn);
9845 			}
9846 
9847 			/* Update PP_ATTR_TABLE */
9848 			if (update_attr_table) {
9849 				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
9850 			}
9851 
9852 			bool mapping_collection_done = false;
9853 			bool pvh_lock_sleep_mode_needed = false;
9854 			do {
9855 				if (__improbable(pvh_lock_sleep_mode_needed)) {
9856 					assert(!preemption_disabled);
9857 					pvh_lock_enter_sleep_mode(&locked_pvh);
9858 					pvh_lock_sleep_mode_needed = false;
9859 				}
9860 
9861 				/* Disable preemption to use the per-CPU structure safely. */
9862 				if (!preemption_disabled) {
9863 					preemption_disabled = true;
9864 					disable_preemption();
9865 					/**
9866 					 * Enter the retype epoch while we gather the disjoint update arguments
9867 					 * and issue the SPTM call.  Since this operation may cover multiple physical
9868 					 * pages, we may construct the argument array and invoke the SPTM without holding
9869 					 * all relevant PVH locks, so we need to record that we are collecting and modifying
9870 					 * mapping state so that e.g. pmap_page_protect() does not attempt to retype the
9871 					 * underlying pages and pmap_remove() does not attempt to free the page tables
9872 					 * used for these mappings without first draining our epoch.
9873 					 */
9874 					pmap_retype_epoch_enter();
9875 
9876 					sptm_pcpu = PERCPU_GET(pmap_sptm_percpu);
9877 					sptm_ops = (sptm_update_disjoint_multipage_op_t *) sptm_pcpu->sptm_ops;
9878 				}
9879 
9880 				/* The return value indicates if we should call into SPTM in this iteration. */
9881 				pmap_sptm_update_cache_attr_ops_collect_return_t retval =
9882 				    pmap_sptm_update_cache_attr_ops_collect(&state, sptm_ops, paddr, cacheattr);
9883 
9884 				/* The collection routine should only return if it needs attention. */
9885 				assert(retval != OPS_COLLECT_NOTHING);
9886 
9887 				/* Gather information for next step from the return value. */
9888 				mapping_collection_done = retval & OPS_COLLECT_RETURN_COMPLETED_PAGE;
9889 				const bool call_sptm = retval & OPS_COLLECT_RETURN_FULL_TABLE;
9890 
9891 				if (call_sptm) {
9892 					/* Call into SPTM with this SPTM ops table. */
9893 					sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, SPTM_MAPPING_LIMIT);
9894 					/**
9895 					 * We may be submitting the batch and exiting the epoch partway through
9896 					 * processing the PV list for a page.  That's fine, because in that case we'll
9897 					 * hold the PV lock for that page, which will prevent mappings of that page from
9898 					 * being disconnected and will prevent the completion of pmap_remove() against
9899 					 * any of those mappings, thus also guaranteeing the relevant page table pages
9900 					 * can't be freed.  The epoch still protects mappings for any prior page in
9901 					 * the batch, whose PV locks are no longer held.
9902 					 */
9903 					pmap_retype_epoch_exit();
9904 					/**
9905 					 * Balance out the explicit disable_preemption() made either at the beginning of
9906 					 * the function or on a prior iteration of the loop that placed the PVH lock in
9907 					 * sleep mode.  Note that enable_preemption() decrements a per-thread counter,
9908 					 * so if we still happen to hold the PVH lock in spin mode preemption won't
9909 					 * actually be re-enabled until we switch the lock over to sleep mode on
9910 					 * the next iteration.
9911 					 */
9912 					enable_preemption();
9913 					preemption_disabled = false;
9914 					pvh_lock_sleep_mode_needed = true;
9915 
9916 					if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9917 						tlb_flush_pass_needed = true;
9918 					}
9919 				}
9920 
9921 				/* We cannot be in a situation where we didn't call into SPTM while also having not finished walking the pv list. */
9922 				assert(call_sptm || mapping_collection_done);
9923 			} while (!mapping_collection_done);
9924 
9925 			/**
9926 			 * We could technically force the cache flush pass here when force_update is true, but
9927 			 * since the compressor mapping/unmapping path handles cache flushing itself, it's fine
9928 			 * leaving this as is.
9929 			 */
9930 			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
9931 				rt_cache_flush_pass_needed = true;
9932 			}
9933 		}
9934 
9935 		pvh_unlock(&locked_pvh);
9936 	}
9937 
9938 	if (pmap_is_sptm_update_cache_attr_ops_pending(state)) {
9939 		assert(preemption_disabled);
9940 		sptm_return_t sptm_ret = sptm_update_disjoint_multipage(sptm_pcpu->sptm_ops_pa, state.sptm_ops_index);
9941 		pmap_retype_epoch_exit();
9942 		if (sptm_ret == SPTM_UPDATE_DELAYED_TLBI) {
9943 			tlb_flush_pass_needed = true;
9944 		}
9945 
9946 		/**
9947 		 * This is the final sptm_update_disjoint_multipage() call in this function, so it's
9948 		 * okay not to update the state variables.
9949 		 */
9950 
9951 		enable_preemption();
9952 	} else if (preemption_disabled) {
9953 		pmap_retype_epoch_exit();
9954 		enable_preemption();
9955 	}
9956 
9957 	if (tlb_flush_pass_needed) {
9958 		/* Sync the PTE writes before potential TLB/Cache flushes. */
9959 		FLUSH_PTE_STRONG();
9960 
9961 		/**
9962 		 * Pass 2: for each physical page and for each mapping, we need to flush
9963 		 * the TLB for it.
9964 		 */
9965 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE2);
9966 		for (unified_page_list_iterator_init(page_list, &iter);
9967 		    !unified_page_list_iterator_end(&iter);
9968 		    unified_page_list_iterator_next(&iter)) {
9969 			bool is_fictitious = false;
9970 			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
9971 			const pmap_paddr_t paddr = ptoa(pn);
9972 
9973 			if (__improbable(!pa_valid(paddr) || is_fictitious)) {
9974 				continue;
9975 			}
9976 
9977 			pmap_flush_tlb_for_paddr_async(paddr);
9978 		}
9979 
9980 #if HAS_FEAT_XS
9981 		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
9982 		arm64_sync_tlb(false);
9983 #else
9984 		/**
9985 		 * For targets that distinguish between mild and strong DSB, mild DSB
9986 		 * will not drain the prefetcher.  This can lead to prefetch-driven
9987 		 * cache fills that defeat the uncacheable requirement of the RT memory type.
9988 		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
9989 		 */
9990 		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
9991 #endif
9992 	}
9993 
9994 	if (rt_cache_flush_pass_needed) {
9995 		/* Pass 3: Flush the cache if the page is recently set to RT */
9996 		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE3);
9997 		/**
9998 		 * We disable preemption to ensure we are not preempted
9999 		 * in the state where DC by VA instructions remain enabled.
10000 		 */
10001 		disable_preemption();
10002 
10003 		assert(get_preemption_level() > 0);
10004 
10005 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10006 		/**
10007 		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10008 		 * and the host will handle cache maintenance for it. So we don't need to
10009 		 * worry about enabling the ops here for AVP.
10010 		 */
10011 		enable_dc_mva_ops();
10012 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10013 		/**
10014 		 * DMB should be sufficient to ensure prior accesses to the memory in question are
10015 		 * correctly ordered relative to the upcoming cache maintenance operations.
10016 		 */
10017 		__builtin_arm_dmb(DMB_SY);
10018 
10019 		for (unified_page_list_iterator_init(page_list, &iter);
10020 		    !unified_page_list_iterator_end(&iter);) {
10021 			bool is_fictitious = false;
10022 			const ppnum_t pn = unified_page_list_iterator_page(&iter, &is_fictitious);
10023 			const pmap_paddr_t paddr = ptoa(pn);
10024 
10025 			if (__improbable(!pa_valid(paddr) || is_fictitious)) {
10026 				unified_page_list_iterator_next(&iter);
10027 				continue;
10028 			}
10029 
10030 			CleanPoC_DcacheRegion_Force_nopreempt_nohid_nobarrier(phystokv(paddr), PAGE_SIZE);
10031 
10032 			unified_page_list_iterator_next(&iter);
10033 			if (__improbable(pmap_pending_preemption() && !unified_page_list_iterator_end(&iter))) {
10034 				__builtin_arm_dsb(DSB_SY);
10035 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10036 				disable_dc_mva_ops();
10037 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10038 				enable_preemption();
10039 				assert(preemption_enabled());
10040 				disable_preemption();
10041 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10042 				enable_dc_mva_ops();
10043 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10044 			}
10045 		}
10046 
10047 		/* Issue DSB to ensure cache maintenance is fully complete before subsequent accesses. */
10048 		__builtin_arm_dsb(DSB_SY);
10049 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10050 		disable_dc_mva_ops();
10051 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10052 
10053 		enable_preemption();
10054 	}
10055 
10056 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_list, cacheattr, 0xCECC0DE4);
10057 }
10058 
10059 /**
10060  * Batch updates the cache attributes of a list of pages. This is a wrapper for
10061  * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10062  *
10063  * @param page_list List of pages to be updated.
10064  * @param cacheattr The new cache attribute.
10065  */
10066 void
10067 pmap_batch_set_cache_attributes(
10068 	const unified_page_list_t *page_list,
10069 	unsigned int cacheattr)
10070 {
10071 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_list, cacheattr, 0xCECC0DE0);
10072 
10073 	/* Verify we are being called from a preemptible context. */
10074 	pmap_verify_preemptible();
10075 
10076 	pmap_batch_set_cache_attributes_internal(page_list, cacheattr, true);
10077 
10078 	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_list, cacheattr, 0xCECC0DEF);
10079 }
10080 
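/**
 * Single-page variant of the batch cache attribute update: wraps 'pn' in a
 * one-entry UPL-style page list and defers to
 * pmap_batch_set_cache_attributes_internal().
 */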
10081 MARK_AS_PMAP_TEXT void
10082 pmap_set_cache_attributes_internal(
10083 	ppnum_t pn,
10084 	unsigned int cacheattr,
10085 	bool update_attr_table)
10086 {
10087 	upl_page_info_t single_page_upl = { .phys_addr = pn };
10088 	const unified_page_list_t page_list = {
10089 		.upl = {.upl_info = &single_page_upl, .upl_size = 1},
10090 		.type = UNIFIED_PAGE_LIST_TYPE_UPL_ARRAY,
10091 	};
10092 
10093 	pmap_batch_set_cache_attributes_internal(&page_list, cacheattr, update_attr_table);
10094 }
10095 
10096 void
10097 pmap_set_cache_attributes(
10098 	ppnum_t pn,
10099 	unsigned int cacheattr)
10100 {
10101 	pmap_set_cache_attributes_internal(pn, cacheattr, true);
10102 }
10103 
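/**
 * Allocate the physical pages backing the commpage data, read-only data and,
 * when CONFIG_ARM_PFZ is enabled, text regions.  The pages' physical
 * addresses are recorded in the commpage_* globals, their kernel virtual
 * addresses are returned through the out parameters, and the user VA of the
 * text commpage is randomized within its leaf table while avoiding the data
 * and read-only data commpage addresses.
 */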
10104 void
10105 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
10106     vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
10107 {
10108 	pmap_paddr_t data_pa = 0; // data address
10109 	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
10110 	pmap_paddr_t text_pa = 0; // text address
10111 
10112 	*kernel_data_addr = 0;
10113 	*kernel_text_addr = 0;
10114 	*user_text_addr = 0;
10115 
10116 	kern_return_t kr = pmap_page_alloc(&data_pa, PMAP_PAGE_ALLOCATE_NONE);
10117 	assert(kr == KERN_SUCCESS);
10118 
10119 	kr = pmap_page_alloc(&ro_data_pa, PMAP_PAGE_ALLOCATE_NONE);
10120 	assert(kr == KERN_SUCCESS);
10121 
10122 #if CONFIG_ARM_PFZ
10123 	kr = pmap_page_alloc(&text_pa, PMAP_PAGE_ALLOCATE_NONE);
10124 	assert(kr == KERN_SUCCESS);
10125 
10126 	/**
10127 	 * The user mapping of the commpage text section is created for the 64-bit mapping only.
10128 	 *
10129 	 * We don't insert it into the 32-bit mapping because we don't want 32-bit
10130 	 * user processes to get this page mapped in; they should never call into
10131 	 * this page.
10132 	 *
10133 	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
10134 	 * is slid in the same L3 as the data commpage.  It is either outside the
10135 	 * max of user VA or is pre-reserved in vm_map_exec(). This means that
10136 	 * it is reserved and unavailable to mach VM for future mappings.
10137 	 */
10138 	const int num_ptes = pt_attr_leaf_size(native_pt_attr) >> PTE_SHIFT;
10139 
10140 	do {
10141 		const int text_leaf_index = random() % num_ptes;
10142 
10143 		/**
10144 		 * Generate a VA for the commpage text with the same root and twig index as data
10145 		 * comm page, but with new leaf index we've just generated.
10146 		 */
10147 		commpage_text_user_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(native_pt_attr));
10148 		commpage_text_user_va |= (text_leaf_index << pt_attr_leaf_shift(native_pt_attr));
10149 	} while ((commpage_text_user_va == _COMM_PAGE64_BASE_ADDRESS) ||
10150 	    (commpage_text_user_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
10151 
10152 	*user_text_addr = commpage_text_user_va;
10153 	*kernel_text_addr = phystokv(text_pa);
10154 #endif
10155 
10156 	/* For manipulation in kernel, go straight to physical page */
10157 	commpage_data_pa = data_pa;
10158 	*kernel_data_addr = phystokv(data_pa);
10159 	assert(commpage_ro_data_pa == 0);
10160 	commpage_ro_data_pa = ro_data_pa;
10161 	*kernel_ro_data_addr = phystokv(ro_data_pa);
10162 	assert(commpage_text_pa == 0);
10163 	commpage_text_pa = text_pa;
10164 }
10165 
10166 
10167 /*
10168  * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
10169  * with user controlled TTEs for regions that aren't explicitly reserved by the
10170  * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
10171  */
10172 #if (ARM_PGSHIFT == 14)
10173 /**
10174  * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
10175  * commpage completely above the maximum 32-bit userspace VA.
10176  */
10177 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
10178 static_assert(_COMM_PAGE64_NESTING_START == SPTM_ARM64_COMMPAGE_REGION_START);
10179 static_assert(_COMM_PAGE64_NESTING_SIZE == SPTM_ARM64_COMMPAGE_REGION_SIZE);
10180 
10181 /**
10182  * Normally there'd be an assert to check that 64-bit devices with 64-bit
10183  * userspace VAs can nest the commpage completely above the maximum 64-bit
10184  * userspace VA, but that technically isn't true on macOS. On those systems, the
10185  * commpage lives within the userspace VA range, but is protected by the VM as
10186  * a reserved region (see vm_reserved_regions[] definition for more info).
10187  */
10188 
10189 #elif (ARM_PGSHIFT == 12)
10190 /**
10191  * Ensure that 64-bit devices using 4K pages can nest the commpage completely
10192  * above the maximum userspace VA.
10193  */
10194 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
10195 #else
10196 #error Nested shared page mapping is unsupported on this config
10197 #endif
10198 
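/**
 * Map the commpage into a user pmap by installing a pointer to the shared,
 * pre-built commpage page table at the appropriate translation-table level,
 * expanding the pmap's table hierarchy first if needed.
 */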
10199 MARK_AS_PMAP_TEXT kern_return_t
10200 pmap_insert_commpage_internal(
10201 	pmap_t pmap)
10202 {
10203 	kern_return_t kr = KERN_SUCCESS;
10204 	vm_offset_t commpage_vaddr;
10205 	pt_entry_t *ttep;
10206 	pmap_paddr_t commpage_table = commpage_default_table;
10207 
10208 	/* Validate the pmap input before accessing its data. */
10209 	validate_pmap_mutable(pmap);
10210 
10211 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10212 	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10213 
10214 #if __ARM_MIXED_PAGE_SIZE__
10215 #if !__ARM_16K_PG__
10216 	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
10217 	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
10218 #endif /* !__ARM_16K_PG__ */
10219 
10220 	/* Choose the correct shared page pmap to use. */
10221 	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
10222 	if (pmap_page_size == 4096) {
10223 		if (pmap_is_64bit(pmap)) {
10224 			commpage_table = commpage_4k_table;
10225 		} else {
10226 			panic("32-bit commpage not currently supported for SPTM configurations");
10227 			//commpage_table = commpage32_4k_table;
10228 		}
10229 	} else if (pmap_page_size != 16384) {
10230 		panic("No commpage table exists for the wanted page size: %llu", pmap_page_size);
10231 	} else
10232 #endif /* __ARM_MIXED_PAGE_SIZE__ */
10233 	{
10234 		if (pmap_is_64bit(pmap)) {
10235 			commpage_table = commpage_default_table;
10236 		} else {
10237 			panic("32-bit commpage not currently supported for SPTM configurations");
10238 			//commpage_table = commpage32_default_table;
10239 		}
10240 	}
10241 
10242 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
10243 #error We assume a single page.
10244 #endif
10245 
10246 	if (pmap_is_64bit(pmap)) {
10247 		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10248 	} else {
10249 		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10250 	}
10251 
10252 
10253 	pmap_lock(pmap, PMAP_LOCK_SHARED);
10254 
10255 	/*
10256 	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
10257 	 * two (2MB) depending on the address space layout. For 16KB pages, each level
10258 	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
10259 	 * to "nest".
10260 	 *
10261 	 * Note: This is not "nesting" in the shared cache sense. This definition of
10262 	 * nesting just means inserting pointers to pre-allocated tables inside of
10263 	 * the passed in pmap to allow us to share page tables (which map the shared
10264 	 * page) for every task. This saves at least one page of memory per process
10265 	 * compared to creating new page tables in every process for mapping the
10266 	 * shared page.
10267 	 */
10268 
10269 	/**
10270 	 * Allocate the twig page tables if needed, and slam a pointer to the shared
10271 	 * page's tables into place.
10272 	 */
10273 	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
10274 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
10275 
10276 		kr = pmap_expand(pmap, commpage_vaddr, 0, commpage_level);
10277 
10278 		if (kr != KERN_SUCCESS) {
10279 			panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
10280 		}
10281 
10282 		pmap_lock(pmap, PMAP_LOCK_SHARED);
10283 	}
10284 
10285 	if (*ttep != ARM_PTE_EMPTY) {
10286 		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
10287 	}
10288 
10289 	sptm_map_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level,
10290 	    (commpage_table & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID);
10291 
10292 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
10293 
10294 	return kr;
10295 }
10296 
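/**
 * Remove the shared commpage table from a user pmap, after verifying that
 * the commpage data page is in fact what is mapped at the commpage address.
 */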
10297 static void
10298 pmap_unmap_commpage(
10299 	pmap_t pmap)
10300 {
10301 	pt_entry_t *ptep;
10302 	vm_offset_t commpage_vaddr;
10303 
10304 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10305 	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
10306 	__assert_only pmap_paddr_t commpage_pa = commpage_data_pa;
10307 
10308 	if (pmap_is_64bit(pmap)) {
10309 		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
10310 	} else {
10311 		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
10312 	}
10313 
10314 
10315 	ptep = pmap_pte(pmap, commpage_vaddr);
10316 
10317 	if (ptep == NULL) {
10318 		return;
10319 	}
10320 
10321 	/* It had better be mapped to the shared page. */
10322 	if (pte_to_pa(*ptep) != commpage_pa) {
10323 		panic("%s: non-commpage PA 0x%llx mapped at VA 0x%llx in pmap %p; expected 0x%llx",
10324 		    __func__, (unsigned long long)pte_to_pa(*ptep), (unsigned long long)commpage_vaddr,
10325 		    pmap, (unsigned long long)commpage_pa);
10326 	}
10327 
10328 	sptm_unmap_table(pmap->ttep, pt_attr_align_va(pt_attr, commpage_level, commpage_vaddr), (sptm_pt_level_t)commpage_level);
10329 }
10330 
10331 void
10332 pmap_insert_commpage(
10333 	pmap_t pmap)
10334 {
10335 	pmap_insert_commpage_internal(pmap);
10336 }
10337 
10338 static boolean_t
10339 pmap_is_64bit(
10340 	pmap_t pmap)
10341 {
10342 	return pmap->is_64bit;
10343 }
10344 
10345 bool
10346 pmap_is_exotic(
10347 	pmap_t pmap __unused)
10348 {
10349 	return false;
10350 }
10351 
10352 
10353 /* ARMTODO -- need an implementation that accounts for
10354  * holes in the physical map, if any.
10355  */
10356 boolean_t
10357 pmap_valid_page(
10358 	ppnum_t pn)
10359 {
10360 	return pa_valid(ptoa(pn));
10361 }
10362 
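/**
 * Return whether page 'pn' belongs to the bootloader: that is, a
 * non-kernel-managed page that falls within an I/O range marked
 * PMAP_IO_RANGE_CARVEOUT.
 */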
10363 boolean_t
10364 pmap_bootloader_page(
10365 	ppnum_t pn)
10366 {
10367 	pmap_paddr_t paddr = ptoa(pn);
10368 
10369 	if (pa_valid(paddr)) {
10370 		return FALSE;
10371 	}
10372 	pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10373 	return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
10374 }
10375 
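/**
 * Report whether the range [va_start, va_end) in 'pmap' contains no valid
 * leaf mappings.  The walk proceeds one twig-sized block at a time; for user
 * pmaps outside the debugger context, the pmap lock is held shared.
 */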
10376 MARK_AS_PMAP_TEXT boolean_t
10377 pmap_is_empty_internal(
10378 	pmap_t pmap,
10379 	vm_map_offset_t va_start,
10380 	vm_map_offset_t va_end)
10381 {
10382 	vm_map_offset_t block_start, block_end;
10383 	tt_entry_t *tte_p;
10384 
10385 	if (pmap == NULL) {
10386 		return TRUE;
10387 	}
10388 
10389 	validate_pmap(pmap);
10390 
10391 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10392 	unsigned int initial_not_in_kdp = not_in_kdp;
10393 
10394 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10395 		pmap_lock(pmap, PMAP_LOCK_SHARED);
10396 	}
10397 
10398 
10399 	/* TODO: This will be faster if we increment ttep at each level. */
10400 	block_start = va_start;
10401 
10402 	while (block_start < va_end) {
10403 		pt_entry_t     *bpte_p, *epte_p;
10404 		pt_entry_t     *pte_p;
10405 
10406 		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10407 		if (block_end > va_end) {
10408 			block_end = va_end;
10409 		}
10410 
10411 		tte_p = pmap_tte(pmap, block_start);
10412 		if ((tte_p != PT_ENTRY_NULL) && tte_is_valid_table(*tte_p)) {
10413 			pte_p = (pt_entry_t *) ttetokv(*tte_p);
10414 			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
10415 			epte_p = &pte_p[pte_index(pt_attr, block_end)];
10416 
10417 			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
10418 				if (*pte_p != ARM_PTE_EMPTY) {
10419 					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10420 						pmap_unlock(pmap, PMAP_LOCK_SHARED);
10421 					}
10422 					return FALSE;
10423 				}
10424 			}
10425 		}
10426 		block_start = block_end;
10427 	}
10428 
10429 	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
10430 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
10431 	}
10432 
10433 	return TRUE;
10434 }
10435 
10436 boolean_t
10437 pmap_is_empty(
10438 	pmap_t pmap,
10439 	vm_map_offset_t va_start,
10440 	vm_map_offset_t va_end)
10441 {
10442 	return pmap_is_empty_internal(pmap, va_start, va_end);
10443 }
10444 
10445 vm_map_offset_t
10446 pmap_max_offset(
10447 	boolean_t               is64,
10448 	unsigned int    option)
10449 {
10450 	return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
10451 }
10452 
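/**
 * Return the maximum user virtual address for a 64-bit pmap, selected by the
 * ARM_PMAP_MAX_OFFSET_* option, the arm64_pmap_max_offset_default boot-arg
 * override, and (for ARM_PMAP_MAX_OFFSET_DEVICE) the amount of physical
 * memory present.
 */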
10453 vm_map_offset_t
10454 pmap_max_64bit_offset(
10455 	__unused unsigned int option)
10456 {
10457 	vm_map_offset_t max_offset_ret = 0;
10458 
10459 	const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
10460 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10461 		max_offset_ret = arm64_pmap_max_offset_default;
10462 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10463 		max_offset_ret = min_max_offset;
10464 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10465 		max_offset_ret = MACH_VM_MAX_ADDRESS;
10466 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10467 		if (arm64_pmap_max_offset_default) {
10468 			max_offset_ret = arm64_pmap_max_offset_default;
10469 		} else if (max_mem > 0xC0000000) {
10470 			// devices with > 3GB of memory
10471 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
10472 		} else if (max_mem > 0x40000000) {
10473 			// devices with > 1GB and <= 3GB of memory
10474 			max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
10475 		} else {
10476 			// devices with <= 1 GB of memory
10477 			max_offset_ret = min_max_offset;
10478 		}
10479 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10480 		if (arm64_pmap_max_offset_default) {
10481 			// Allow the boot-arg to override jumbo size
10482 			max_offset_ret = arm64_pmap_max_offset_default;
10483 		} else {
10484 			max_offset_ret = MACH_VM_JUMBO_ADDRESS;     // Max offset is 64GB for pmaps with special "jumbo" blessing
10485 		}
10486 #if XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT
10487 	} else if (option == ARM_PMAP_MAX_OFFSET_EXTRA_JUMBO) {
10488 		max_offset_ret = MACH_VM_MAX_ADDRESS;
10489 #endif /* XNU_TARGET_OS_IOS && EXTENDED_USER_VA_SUPPORT */
10490 	} else {
10491 		panic("pmap_max_64bit_offset illegal option 0x%x", option);
10492 	}
10493 
10494 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10495 	if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
10496 		assert(max_offset_ret >= min_max_offset);
10497 	}
10498 
10499 	return max_offset_ret;
10500 }
10501 
10502 vm_map_offset_t
10503 pmap_max_32bit_offset(
10504 	unsigned int option)
10505 {
10506 	vm_map_offset_t max_offset_ret = 0;
10507 
10508 	if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
10509 		max_offset_ret = arm_pmap_max_offset_default;
10510 	} else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
10511 		max_offset_ret = VM_MAX_ADDRESS;
10512 	} else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
10513 		max_offset_ret = VM_MAX_ADDRESS;
10514 	} else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
10515 		if (arm_pmap_max_offset_default) {
10516 			max_offset_ret = arm_pmap_max_offset_default;
10517 		} else if (max_mem > 0x20000000) {
10518 			max_offset_ret = VM_MAX_ADDRESS;
10519 		} else {
10520 			max_offset_ret = VM_MAX_ADDRESS;
10521 		}
10522 	} else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
10523 		max_offset_ret = VM_MAX_ADDRESS;
10524 	} else {
10525 		panic("pmap_max_32bit_offset illegal option 0x%x", option);
10526 	}
10527 
10528 	assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
10529 	return max_offset_ret;
10530 }
10531 
10532 #if CONFIG_DTRACE
10533 /*
10534  * Constrain DTrace copyin/copyout actions
10535  */
10536 extern kern_return_t dtrace_copyio_preflight(addr64_t);
10537 extern kern_return_t dtrace_copyio_postflight(addr64_t);
10538 
10539 kern_return_t
10540 dtrace_copyio_preflight(
10541 	__unused addr64_t va)
10542 {
10543 	if (current_map() == kernel_map) {
10544 		return KERN_FAILURE;
10545 	} else {
10546 		return KERN_SUCCESS;
10547 	}
10548 }
10549 
10550 kern_return_t
10551 dtrace_copyio_postflight(
10552 	__unused addr64_t va)
10553 {
10554 	return KERN_SUCCESS;
10555 }
10556 #endif /* CONFIG_DTRACE */
10557 
10558 
10559 void
10560 pmap_flush_context_init(__unused pmap_flush_context *pfc)
10561 {
10562 }
10563 
10564 
10565 void
10566 pmap_flush(
10567 	__unused pmap_flush_context *cpus_to_flush)
10568 {
10569 	/* not implemented yet */
10570 	return;
10571 }
10572 
10573 /**
10574  * Perform basic validation checks on the destination (but not the source) and
10575  * the corresponding offset/size prior to writing to a read-only allocation.
10576  *
10577  * @note Should be called before writing to an allocation from the read
10578  * only allocator.
10579  *
10580  * @param zid The ID of the zone the allocation belongs to.
10581  * @param va VA of element being modified (destination).
10582  * @param offset Offset being written to, in the element.
10583  * @param new_data_size Size of modification.
10584  *
10585  */
10586 
10587 MARK_AS_PMAP_TEXT static void
10588 pmap_ro_zone_validate_element_dst(
10589 	zone_id_t           zid,
10590 	vm_offset_t         va,
10591 	vm_offset_t         offset,
10592 	vm_size_t           new_data_size)
10593 {
10594 	if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
10595 		panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
10596 		    ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
10597 	}
10598 
10599 	vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
10600 
10601 	/* Check element is from correct zone and properly aligned */
10602 	zone_require_ro(zid, elem_size, (void*)va);
10603 
10604 	if (__improbable(new_data_size > (elem_size - offset))) {
10605 		panic("%s: New data size %lu too large for elem size %lu at addr %p",
10606 		    __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
10607 	}
10608 	if (__improbable(offset >= elem_size)) {
10609 		panic("%s: Offset %lu too large for elem size %lu at addr %p",
10610 		    __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
10611 	}
10612 }
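
/*
 * Worked example of the bounds checks above (hypothetical sizes): for an
 * RO-zone element with elem_size 128, a write of new_data_size 16 at
 * offset 120 exceeds (elem_size - offset) == 8 and panics, whereas the same
 * 16-byte write at offset 96 fits within the remaining 32 bytes and is
 * allowed.
 */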
10613 
10614 
10615 /**
10616  * Perform basic validation checks on the source, destination and
10617  * corresponding offset/sizes prior to writing to a read only allocation.
10618  *
10619  * @note Should be called before writing to an allocation from the read
10620  * only allocator.
10621  *
10622  * @param zid The ID of the zone the allocation belongs to.
10623  * @param va VA of element being modified (destination).
10624  * @param offset Offset being written to, in the element.
10625  * @param new_data Pointer to new data (source).
10626  * @param new_data_size Size of modification.
10627  *
10628  */
10629 
10630 MARK_AS_PMAP_TEXT static void
10631 pmap_ro_zone_validate_element(
10632 	zone_id_t           zid,
10633 	vm_offset_t         va,
10634 	vm_offset_t         offset,
10635 	const vm_offset_t   new_data,
10636 	vm_size_t           new_data_size)
10637 {
10638 	vm_offset_t sum = 0;
10639 
10640 	if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
10641 		panic("%s: Integer addition overflow %p + %lu = %lu",
10642 		    __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
10643 	}
10644 
10645 	pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
10646 }
10647 
10648 /**
10649  * Function to configure RO zone access permissions for a forthcoming write operation.
10650  */
10651 static void
10652 pmap_ro_zone_prepare_write(void)
10653 {
10654 }
10655 
10656 /**
10657  * Function to indicate that a preceding RO zone write operation is complete.
10658  */
10659 static void
10660 pmap_ro_zone_complete_write(void)
10661 {
10662 }
10663 
10664 /**
10665  * Function to align an address or size to the required RO zone mapping alignment.
10666  *
10667  * For the SPTM the RO zone region must be aligned on a twig boundary so that at least
10668  * the last-level kernel pagetable can be of the appropriate SPTM RO zone table type,
10669  * which allows the SPTM to enforce RO zone mapping permission restrictions.
10670  *
10671  * @param value the address or size to be aligned.
10672  *
10673  * @return the aligned value
10674  */
10675 vm_offset_t
10676 pmap_ro_zone_align(vm_offset_t value)
10677 {
10678 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(kernel_pmap);
10679 	return PMAP_ALIGN(value, pt_attr_twig_size(pt_attr));
10680 }
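
/*
 * Worked example (assuming PMAP_ALIGN rounds up to the given alignment): with
 * 16KB pages the kernel twig size is 32MB, so pmap_ro_zone_align(0x1234000)
 * would return the next 32MB boundary, 0x2000000, while an already-aligned
 * value such as 0x4000000 would be returned unchanged.
 */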
10681 
10682 /**
10683  * Function to copy kauth_cred from new_data to kv.
10684  * Function defined in "kern_prot.c"
10685  *
10686  * @note Will be removed upon completion of
10687  * <rdar://problem/72635194> Compiler PAC support for memcpy.
10688  *
10689  * @param kv Address to copy new data to.
10690  * @param new_data Pointer to new data.
10691  *
10692  */
10693 
10694 extern void
10695 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
10696 
10697 /**
10698  * Zalloc-specific memcpy that writes through the physical aperture
10699  * and ensures the element being modified is from a read-only zone.
10700  *
10701  * @note Designed to work only with the zone allocator's read-only submap.
10702  *
10703  * @param zid The ID of the zone to allocate from.
10704  * @param zid The ID of the zone the element belongs to.
10705  * @param offset Offset from element.
10706  * @param new_data Pointer to new data.
10707  * @param new_data_size	Size of modification.
10708  *
10709  */
10710 
10711 void
10712 pmap_ro_zone_memcpy(
10713 	zone_id_t           zid,
10714 	vm_offset_t         va,
10715 	vm_offset_t         offset,
10716 	const vm_offset_t   new_data,
10717 	vm_size_t           new_data_size)
10718 {
10719 	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
10720 }
10721 
10722 MARK_AS_PMAP_TEXT void
10723 pmap_ro_zone_memcpy_internal(
10724 	zone_id_t             zid,
10725 	vm_offset_t           va,
10726 	vm_offset_t           offset,
10727 	const vm_offset_t     new_data,
10728 	vm_size_t             new_data_size)
10729 {
10730 	if (!new_data || new_data_size == 0) {
10731 		return;
10732 	}
10733 
10734 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
10735 	const bool istate = ml_set_interrupts_enabled(FALSE);
10736 	pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
10737 	pmap_ro_zone_prepare_write();
10738 	memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
10739 	pmap_ro_zone_complete_write();
10740 	ml_set_interrupts_enabled(istate);
10741 }
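
/*
 * Illustrative caller sketch (hypothetical element type and field; real
 * callers live in the zalloc read-only mutation path): to update an 8-byte
 * field of an element that resides in a read-only zone, a caller would pass
 * the element VA, the field offset, and a pointer to the new value:
 *
 *     uint64_t new_value = 42;
 *     pmap_ro_zone_memcpy(zid, elem_va, offsetof(struct my_ro_elem, field),
 *         (vm_offset_t)&new_value, sizeof(new_value));
 */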
10742 
10743 /**
10744  * Zalloc-specific function to atomically mutate fields of an element that
10745  * belongs to a read-only zone, via the physical aperture.
10746  *
10747  * @note Designed to work only with the zone allocator's read-only submap.
10748  *
10749  * @param zid The ID of the zone the element belongs to.
10750  * @param va VA of element to be modified.
10751  * @param offset Offset in element.
10752  * @param op Atomic operation to perform.
10753  * @param value	Mutation value.
10754  *
10755  */
10756 
10757 uint64_t
10758 pmap_ro_zone_atomic_op(
10759 	zone_id_t             zid,
10760 	vm_offset_t           va,
10761 	vm_offset_t           offset,
10762 	zro_atomic_op_t       op,
10763 	uint64_t              value)
10764 {
10765 	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
10766 }
10767 
10768 MARK_AS_PMAP_TEXT uint64_t
10769 pmap_ro_zone_atomic_op_internal(
10770 	zone_id_t             zid,
10771 	vm_offset_t           va,
10772 	vm_offset_t           offset,
10773 	zro_atomic_op_t       op,
10774 	uint64_t              value)
10775 {
10776 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
10777 	vm_size_t value_size = op & 0xf; /* the low nibble of the op encodes the operand size in bytes */
10778 	const boolean_t istate = ml_set_interrupts_enabled(FALSE);
10779 
10780 	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
10781 	pmap_ro_zone_prepare_write();
10782 	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
10783 	pmap_ro_zone_complete_write();
10784 	ml_set_interrupts_enabled(istate);
10785 
10786 	return value;
10787 }
10788 
10789 /**
10790  * bzero for allocations from read-only zones that writes through the
10791  * physical aperture.
10792  *
10793  * @note This is called by the zfree path of all allocations from read
10794  * only zones.
10795  *
10796  * @param zid The ID of the zone the allocation belongs to.
10797  * @param va VA of element to be zeroed.
10798  * @param offset Offset in the element.
10799  * @param size	Size of allocation.
10800  *
10801  */
10802 
10803 void
10804 pmap_ro_zone_bzero(
10805 	zone_id_t       zid,
10806 	vm_offset_t     va,
10807 	vm_offset_t     offset,
10808 	vm_size_t       size)
10809 {
10810 	pmap_ro_zone_bzero_internal(zid, va, offset, size);
10811 }
10812 
10813 MARK_AS_PMAP_TEXT void
10814 pmap_ro_zone_bzero_internal(
10815 	zone_id_t       zid,
10816 	vm_offset_t     va,
10817 	vm_offset_t     offset,
10818 	vm_size_t       size)
10819 {
10820 	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
10821 	const boolean_t istate = ml_set_interrupts_enabled(FALSE);
10822 	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
10823 	pmap_ro_zone_prepare_write();
10824 	bzero((void*)phystokv(pa), size);
10825 	pmap_ro_zone_complete_write();
10826 	ml_set_interrupts_enabled(istate);
10827 }
10828 
10829 #define PMAP_RESIDENT_INVALID   ((mach_vm_size_t)-1)
10830 
10831 MARK_AS_PMAP_TEXT mach_vm_size_t
10832 pmap_query_resident_internal(
10833 	pmap_t                  pmap,
10834 	vm_map_address_t        start,
10835 	vm_map_address_t        end,
10836 	mach_vm_size_t          *compressed_bytes_p)
10837 {
10838 	mach_vm_size_t  resident_bytes = 0;
10839 	mach_vm_size_t  compressed_bytes = 0;
10840 
10841 	pt_entry_t     *bpte, *epte;
10842 	pt_entry_t     *pte_p;
10843 	tt_entry_t     *tte_p;
10844 
10845 	if (pmap == NULL) {
10846 		return PMAP_RESIDENT_INVALID;
10847 	}
10848 
10849 	validate_pmap(pmap);
10850 
10851 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10852 
10853 	/* Ensure that this request is valid, and addresses exactly one TTE. */
10854 	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
10855 	    (end % pt_attr_page_size(pt_attr)))) {
10856 		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
10857 	}
10858 
10859 	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
10860 		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
10861 	}
10862 
10863 	pmap_lock(pmap, PMAP_LOCK_SHARED);
10864 	tte_p = pmap_tte(pmap, start);
10865 	if (tte_p == (tt_entry_t *) NULL) {
10866 		pmap_unlock(pmap, PMAP_LOCK_SHARED);
10867 		return PMAP_RESIDENT_INVALID;
10868 	}
10869 	if (tte_is_valid_table(*tte_p)) {
10870 		pte_p = (pt_entry_t *) ttetokv(*tte_p);
10871 		bpte = &pte_p[pte_index(pt_attr, start)];
10872 		epte = &pte_p[pte_index(pt_attr, end)];
10873 
10874 		for (; bpte < epte; bpte++) {
10875 			if (pte_is_compressed(*bpte, bpte)) {
10876 				compressed_bytes += pt_attr_page_size(pt_attr);
10877 			} else if (pa_valid(pte_to_pa(*bpte))) {
10878 				resident_bytes += pt_attr_page_size(pt_attr);
10879 			}
10880 		}
10881 	}
10882 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
10883 
10884 	if (compressed_bytes_p) {
10885 		*compressed_bytes_p += compressed_bytes;
10886 	}
10887 
10888 	return resident_bytes;
10889 }
10890 
10891 mach_vm_size_t
10892 pmap_query_resident(
10893 	pmap_t                  pmap,
10894 	vm_map_address_t        start,
10895 	vm_map_address_t        end,
10896 	mach_vm_size_t          *compressed_bytes_p)
10897 {
10898 	mach_vm_size_t          total_resident_bytes;
10899 	mach_vm_size_t          compressed_bytes;
10900 	vm_map_address_t        va;
10901 
10902 
10903 	if (pmap == PMAP_NULL) {
10904 		if (compressed_bytes_p) {
10905 			*compressed_bytes_p = 0;
10906 		}
10907 		return 0;
10908 	}
10909 
10910 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
10911 
10912 	total_resident_bytes = 0;
10913 	compressed_bytes = 0;
10914 
10915 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
10916 	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
10917 	    VM_KERNEL_ADDRHIDE(end));
10918 
10919 	va = start;
10920 	while (va < end) {
10921 		vm_map_address_t l;
10922 		mach_vm_size_t resident_bytes;
10923 
10924 		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
10925 
10926 		if (l > end) {
10927 			l = end;
10928 		}
10929 		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
10930 		if (resident_bytes == PMAP_RESIDENT_INVALID) {
10931 			break;
10932 		}
10933 
10934 		total_resident_bytes += resident_bytes;
10935 
10936 		va = l;
10937 	}
10938 
10939 	if (compressed_bytes_p) {
10940 		*compressed_bytes_p = compressed_bytes;
10941 	}
10942 
10943 	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
10944 	    total_resident_bytes);
10945 
10946 	return total_resident_bytes;
10947 }
10948 
10949 #if MACH_ASSERT
10950 static void
10951 pmap_check_ledgers(
10952 	pmap_t pmap)
10953 {
10954 	int     pid;
10955 	char    *procname;
10956 
10957 	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
10958 		/*
10959 		 * This pmap was not or is no longer fully associated
10960 		 * with a task (e.g. the old pmap after a fork()/exec() or
10961 		 * spawn()).  Its "ledger" still points at a task that is
10962 		 * now using a different (and active) address space, so
10963 		 * we can't check that all the pmap ledgers are balanced here.
10964 		 *
10965 		 * If the "pid" is set, that means that we went through
10966 		 * pmap_set_process() in task_terminate_internal(), so
10967 		 * this task's ledger should not have been re-used and
10968 		 * all the pmap ledgers should be back to 0.
10969 		 */
10970 		return;
10971 	}
10972 
10973 	pid = pmap->pmap_pid;
10974 	procname = pmap->pmap_procname;
10975 
10976 	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
10977 }
10978 #endif /* MACH_ASSERT */
10979 
10980 void
10981 pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
10982 {
10983 }
10984 
10985 /**
10986  * The minimum shared region nesting size is used by the VM to determine when to
10987  * break up large mappings to nested regions. The smallest size that these
10988  * mappings can be broken into is determined by what page table level those
10989  * regions are being nested in at and the size of the page tables.
10990  *
10991  * For instance, if a nested region is nesting at L2 for a process utilizing
10992  * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
10993  * block entry).
10994  *
10995  * @param pmap The target pmap to determine the block size based on whether it's
10996  *             using 16KB or 4KB page tables.
10997  */
10998 uint64_t
10999 pmap_shared_region_size_min(__unused pmap_t pmap)
11000 {
11001 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11002 
11003 	/**
11004 	 * We always nest the shared region at L2 (32MB for 16KB pages, 8MB for
11005 	 * 4KB pages). This means that a target pmap will contain L2 entries that
11006 	 * point to shared L3 page tables in the shared region pmap.
11007 	 */
11008 	const uint64_t page_ratio = PAGE_SIZE / pt_attr_page_size(pt_attr);
11009 	return pt_attr_twig_size(pt_attr) * page_ratio;
11010 }
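
/*
 * Worked example of the calculation above: a 16KB-page task on a 16KB-page
 * kernel has a twig (L2 entry) size of 32MB and a page_ratio of 1, giving a
 * 32MB minimum nesting size; a 4KB-page task on a 16KB-page kernel has a 2MB
 * twig size and a page_ratio of 4, giving the 8MB minimum noted above.
 */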
11011 
11012 boolean_t
11013 pmap_enforces_execute_only(
11014 	pmap_t pmap)
11015 {
11016 	return pmap != kernel_pmap;
11017 }
11018 
11019 MARK_AS_PMAP_TEXT void
11020 pmap_set_vm_map_cs_enforced_internal(
11021 	pmap_t pmap,
11022 	bool new_value)
11023 {
11024 	validate_pmap_mutable(pmap);
11025 	pmap->pmap_vm_map_cs_enforced = new_value;
11026 }
11027 
11028 void
11029 pmap_set_vm_map_cs_enforced(
11030 	pmap_t pmap,
11031 	bool new_value)
11032 {
11033 	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
11034 }
11035 
11036 extern int cs_process_enforcement_enable;
11037 bool
11038 pmap_get_vm_map_cs_enforced(
11039 	pmap_t pmap)
11040 {
11041 	if (cs_process_enforcement_enable) {
11042 		return true;
11043 	}
11044 	return pmap->pmap_vm_map_cs_enforced;
11045 }
11046 
11047 MARK_AS_PMAP_TEXT void
11048 pmap_set_jit_entitled_internal(
11049 	__unused pmap_t pmap)
11050 {
11051 }
11052 
11053 void
11054 pmap_set_jit_entitled(
11055 	pmap_t pmap)
11056 {
11057 	pmap_set_jit_entitled_internal(pmap);
11058 }
11059 
11060 bool
11061 pmap_get_jit_entitled(
11062 	__unused pmap_t pmap)
11063 {
11064 	return false;
11065 }
11066 
11067 MARK_AS_PMAP_TEXT void
11068 pmap_set_tpro_internal(
11069 	__unused pmap_t pmap)
11070 {
11071 	return;
11072 }
11073 
11074 void
11075 pmap_set_tpro(
11076 	pmap_t pmap)
11077 {
11078 	pmap_set_tpro_internal(pmap);
11079 }
11080 
11081 bool
11082 pmap_get_tpro(
11083 	__unused pmap_t pmap)
11084 {
11085 	return false;
11086 }
11087 
11088 
11089 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
11090 
11091 MARK_AS_PMAP_TEXT kern_return_t
11092 pmap_query_page_info_internal(
11093 	pmap_t          pmap,
11094 	vm_map_offset_t va,
11095 	int             *disp_p)
11096 {
11097 	pmap_paddr_t    pa;
11098 	int             disp;
11099 	unsigned int    pai;
11100 	pt_entry_t      *pte_p;
11101 	pv_entry_t      *pve_p;
11102 
11103 	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
11104 		*disp_p = 0;
11105 		return KERN_INVALID_ARGUMENT;
11106 	}
11107 
11108 	validate_pmap(pmap);
11109 	pmap_lock(pmap, PMAP_LOCK_SHARED);
11110 
11111 try_again:
11112 	disp = 0;
11113 
11114 	pte_p = pmap_pte(pmap, va);
11115 	if (pte_p == PT_ENTRY_NULL) {
11116 		goto done;
11117 	}
11118 
11119 	const pt_entry_t pte = os_atomic_load(pte_p, relaxed);
11120 	pa = pte_to_pa(pte);
11121 	if (pa == 0) {
11122 		if (pte_is_compressed(pte, pte_p)) {
11123 			disp |= PMAP_QUERY_PAGE_COMPRESSED;
11124 			if (pte & ARM_PTE_COMPRESSED_ALT) {
11125 				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
11126 			}
11127 		}
11128 	} else {
11129 		disp |= PMAP_QUERY_PAGE_PRESENT;
11130 		pai = pa_index(pa);
11131 		if (!pa_valid(pa)) {
11132 			goto done;
11133 		}
11134 		locked_pvh_t locked_pvh = pvh_lock(pai);
11135 		if (__improbable(pte != os_atomic_load(pte_p, relaxed))) {
11136 			/* something changed: try again */
11137 			pvh_unlock(&locked_pvh);
11138 			pmap_query_page_info_retries++;
11139 			goto try_again;
11140 		}
11141 		pve_p = PV_ENTRY_NULL;
11142 		int pve_ptep_idx = 0;
11143 		if (pvh_test_type(locked_pvh.pvh, PVH_TYPE_PVEP)) {
11144 			unsigned int npves = 0;
11145 			pve_p = pvh_pve_list(locked_pvh.pvh);
11146 			while (pve_p != PV_ENTRY_NULL &&
11147 			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
11148 				if (__improbable(npves == (SPTM_MAPPING_LIMIT / PTE_PER_PVE))) {
11149 					pvh_lock_enter_sleep_mode(&locked_pvh);
11150 				}
11151 				pve_p = pve_next(pve_p);
11152 				npves++;
11153 			}
11154 		}
11155 
11156 		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
11157 			disp |= PMAP_QUERY_PAGE_ALTACCT;
11158 		} else if (ppattr_test_reusable(pai)) {
11159 			disp |= PMAP_QUERY_PAGE_REUSABLE;
11160 		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
11161 			disp |= PMAP_QUERY_PAGE_INTERNAL;
11162 		}
11163 		pvh_unlock(&locked_pvh);
11164 	}
11165 
11166 done:
11167 	pmap_unlock(pmap, PMAP_LOCK_SHARED);
11168 	*disp_p = disp;
11169 	return KERN_SUCCESS;
11170 }
11171 
11172 kern_return_t
11173 pmap_query_page_info(
11174 	pmap_t          pmap,
11175 	vm_map_offset_t va,
11176 	int             *disp_p)
11177 {
11178 	return pmap_query_page_info_internal(pmap, va, disp_p);
11179 }
11180 
11181 
11182 
11183 uint32_t
11184 pmap_user_va_bits(pmap_t pmap __unused)
11185 {
11186 #if __ARM_MIXED_PAGE_SIZE__
11187 	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
11188 	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
11189 #else
11190 	return 64 - T0SZ_BOOT;
11191 #endif
11192 }
11193 
11194 uint32_t
11195 pmap_kernel_va_bits(void)
11196 {
11197 	return 64 - T1SZ_BOOT;
11198 }
11199 
11200 static vm_map_size_t
11201 pmap_user_va_size(pmap_t pmap)
11202 {
11203 	return 1ULL << pmap_user_va_bits(pmap);
11204 }
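
/*
 * Worked example: a TCR T0SZ value of 25 would yield 64 - 25 = 39 bits of user
 * VA, i.e. a pmap_user_va_size() of 1ULL << 39 = 512GB. The concrete T0SZ_BOOT
 * value is configuration dependent.
 */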
11205 
11206 
11207 
11208 bool
11209 pmap_in_ppl(void)
11210 {
11211 	return false;
11212 }
11213 
11214 MARK_AS_PMAP_TEXT void
11215 pmap_footprint_suspend_internal(
11216 	vm_map_t        map,
11217 	boolean_t       suspend)
11218 {
11219 #if DEVELOPMENT || DEBUG
11220 	if (suspend) {
11221 		current_thread()->pmap_footprint_suspended = TRUE;
11222 		map->pmap->footprint_was_suspended = TRUE;
11223 	} else {
11224 		current_thread()->pmap_footprint_suspended = FALSE;
11225 	}
11226 #else /* DEVELOPMENT || DEBUG */
11227 	(void) map;
11228 	(void) suspend;
11229 #endif /* DEVELOPMENT || DEBUG */
11230 }
11231 
11232 void
11233 pmap_footprint_suspend(
11234 	vm_map_t map,
11235 	boolean_t suspend)
11236 {
11237 	pmap_footprint_suspend_internal(map, suspend);
11238 }
11239 
11240 void
11241 pmap_nop(pmap_t pmap)
11242 {
11243 	validate_pmap_mutable(pmap);
11244 }
11245 
11246 pmap_t
11247 pmap_txm_kernel_pmap(void)
11248 {
11249 	return kernel_pmap;
11250 }
11251 
11252 TXMAddressSpace_t*
11253 pmap_txm_addr_space(const pmap_t pmap)
11254 {
11255 	if (pmap) {
11256 		return pmap->txm_addr_space;
11257 	}
11258 
11259 	/*
11260 	 * When the passed-in pmap is NULL, it means the caller wishes to operate
11261 	 * on the current_pmap(). We could resolve and return that, but it is actually
11262 	 * safer to return NULL, since these TXM interfaces also accept NULL inputs,
11263 	 * which cause TXM to resolve to the current_pmap() equivalent internally.
11264 	 */
11265 	return NULL;
11266 }
11267 
11268 void
11269 pmap_txm_set_addr_space(
11270 	pmap_t pmap,
11271 	TXMAddressSpace_t *txm_addr_space)
11272 {
11273 	assert(pmap != NULL);
11274 
11275 	if (pmap->txm_addr_space && txm_addr_space) {
11276 		/* Attempted to overwrite the address space in the PMAP */
11277 		panic("attempted overwrite of TXM address space: %p | %p | %p",
11278 		    pmap, pmap->txm_addr_space, txm_addr_space);
11279 	} else if (!pmap->txm_addr_space && !txm_addr_space) {
11280 		/* This should never happen */
11281 		panic("attempted NULL overwrite of TXM address space: %p", pmap);
11282 	}
11283 
11284 	pmap->txm_addr_space = txm_addr_space;
11285 }
11286 
11287 void
11288 pmap_txm_set_trust_level(
11289 	pmap_t pmap,
11290 	CSTrust_t trust_level)
11291 {
11292 	assert(pmap != NULL);
11293 
11294 	CSTrust_t current_trust = pmap->txm_trust_level;
11295 	if (current_trust != kCSTrustUntrusted) {
11296 		panic("attempted to overwrite TXM trust on the pmap: %p", pmap);
11297 	}
11298 
11299 	pmap->txm_trust_level = trust_level;
11300 }
11301 
11302 kern_return_t
11303 pmap_txm_get_trust_level_kdp(
11304 	pmap_t pmap,
11305 	CSTrust_t *trust_level)
11306 {
11307 	if (pmap == NULL) {
11308 		return KERN_INVALID_ARGUMENT;
11309 	} else if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11310 		return KERN_INVALID_ARGUMENT;
11311 	}
11312 
11313 	if (trust_level != NULL) {
11314 		*trust_level = pmap->txm_trust_level;
11315 	}
11316 	return KERN_SUCCESS;
11317 }
11318 
11319 kern_return_t
11320 pmap_txm_get_jit_address_range_kdp(
11321 	pmap_t pmap,
11322 	uintptr_t *jit_region_start,
11323 	uintptr_t *jit_region_end)
11324 {
11325 	if (ml_validate_nofault((vm_offset_t)pmap, sizeof(*pmap)) == false) {
11326 		return KERN_INVALID_ARGUMENT;
11327 	}
11328 	TXMAddressSpace_t *txm_addr_space = pmap_txm_addr_space(pmap);
11329 	if (NULL == txm_addr_space) {
11330 		return KERN_INVALID_ARGUMENT;
11331 	}
11332 	if (ml_validate_nofault((vm_offset_t)txm_addr_space, sizeof(*txm_addr_space)) == false) {
11333 		return KERN_INVALID_ARGUMENT;
11334 	}
11335 	/**
11336 	 * It's a bit gross that we're dereferencing what is supposed to be an abstract type.
11337 	 * If we were running in the TXM, we would always perform additional checks on txm_addr_space,
11338 	 * but this isn't necessary here, since we are running in the kernel and only using the results for
11339 	 * diagnostic purposes, rather than any policy enforcement.
11340 	 */
11341 	if (txm_addr_space->jitRegion) {
11342 		if (ml_validate_nofault((vm_offset_t)txm_addr_space->jitRegion, sizeof(txm_addr_space->jitRegion)) == false) {
11343 			return KERN_INVALID_ARGUMENT;
11344 		}
11345 		if (txm_addr_space->jitRegion->addr && txm_addr_space->jitRegion->addrEnd) {
11346 			*jit_region_start = txm_addr_space->jitRegion->addr;
11347 			*jit_region_end = txm_addr_space->jitRegion->addrEnd;
11348 			return KERN_SUCCESS;
11349 		}
11350 	}
11351 	return KERN_NOT_FOUND;
11352 }
11353 
11354 static pmap_t
11355 _pmap_txm_resolve_pmap(pmap_t pmap)
11356 {
11357 	if (pmap == NULL) {
11358 		pmap = current_pmap();
11359 		if (pmap == kernel_pmap) {
11360 			return NULL;
11361 		}
11362 	}
11363 
11364 	return pmap;
11365 }
11366 
11367 void
11368 pmap_txm_acquire_shared_lock(pmap_t pmap)
11369 {
11370 	pmap = _pmap_txm_resolve_pmap(pmap);
11371 	if (!pmap) {
11372 		return;
11373 	}
11374 
11375 	lck_rw_lock_shared(&pmap->txm_lck);
11376 }
11377 
11378 void
11379 pmap_txm_release_shared_lock(pmap_t pmap)
11380 {
11381 	pmap = _pmap_txm_resolve_pmap(pmap);
11382 	if (!pmap) {
11383 		return;
11384 	}
11385 
11386 	lck_rw_unlock_shared(&pmap->txm_lck);
11387 }
11388 
11389 void
11390 pmap_txm_acquire_exclusive_lock(pmap_t pmap)
11391 {
11392 	pmap = _pmap_txm_resolve_pmap(pmap);
11393 	if (!pmap) {
11394 		return;
11395 	}
11396 
11397 	lck_rw_lock_exclusive(&pmap->txm_lck);
11398 }
11399 
11400 void
11401 pmap_txm_release_exclusive_lock(pmap_t pmap)
11402 {
11403 	pmap = _pmap_txm_resolve_pmap(pmap);
11404 	if (!pmap) {
11405 		return;
11406 	}
11407 
11408 	lck_rw_unlock_exclusive(&pmap->txm_lck);
11409 }
11410 
11411 static void
11412 _pmap_txm_transfer_page(const pmap_paddr_t addr)
11413 {
11414 	sptm_retype_params_t retype_params = {
11415 		.raw = SPTM_RETYPE_PARAMS_NULL
11416 	};
11417 
11418 	/* Retype through the SPTM */
11419 	sptm_retype(addr, XNU_DEFAULT, TXM_DEFAULT, retype_params);
11420 }
11421 
11422 /**
11423  * Prepare a page for retyping to TXM_DEFAULT by clearing its
11424  * internal flags.
11425  *
11426  * @param pa Physical address of the page.
11427  */
11428 static inline void
11429 _pmap_txm_retype_prepare(const pmap_paddr_t pa)
11430 {
11431 	const sptm_retype_params_t retype_params = {
11432 		.raw = SPTM_RETYPE_PARAMS_NULL
11433 	};
11434 
11435 	/**
11436 	 * SPTM allows XNU_DEFAULT pages to request deferral of TLB flushing
11437 	 * when their PTE is updated, which is an important performance
11438 	 * optimization. However, this also allows an attacker-controlled
11439 	 * XNU to exploit a read reference with a stale write-enabled PTE in
11440 	 * the TLB. This is fine as long as the page is not retyped, since the
11441 	 * damage is contained within the XNU domain. However, when such a page
11442 	 * needs to be retyped, SPTM has to make sure there is no outstanding
11443 	 * reference and no history of deferred TLBIs. Internally,
11444 	 * SPTM maintains a flag tracking past deferred TLBIs that only gets
11445 	 * cleared on retyping with no outstanding reference. Therefore, we
11446 	 * do a dummy retype to XNU_DEFAULT itself to clear the internal flag
11447 	 * before we actually transfer the page to the TXM domain. To make sure
11448 	 * SPTM won't throw a violation, all the mappings to the page have to
11449 	 * be removed before calling this.
11450 	 */
11451 	sptm_retype(pa, XNU_DEFAULT, XNU_DEFAULT, retype_params);
11452 }
11453 
11454 /**
11455  * Transfer an XNU owned page to TXM domain.
11456  *
11457  * @param addr Kernel virtual address of the page. It has to be page size
11458  *             aligned.
11459  */
11460 void
11461 pmap_txm_transfer_page(const vm_address_t addr)
11462 {
11463 	assert((addr & PAGE_MASK) == 0);
11464 
11465 	const pmap_paddr_t pa = kvtophys_nofail(addr);
11466 	const unsigned int pai = pa_index(pa);
11467 
11468 	/* Lock the PVH lock to prevent concurrent updates to the mappings during the self retype below. */
11469 	locked_pvh_t locked_pvh = pvh_lock(pai);
11470 
11471 	/* Disconnect the mapping to assure SPTM of no pending TLBI. */
11472 	pmap_page_protect_options_with_flush_range((ppnum_t)atop(pa), VM_PROT_NONE,
11473 	    PMAP_OPTIONS_PPO_PENDING_RETYPE, &locked_pvh, NULL);
11474 
11475 	/* Self retype to clear the SPTM internal flags tracking delayed TLBIs for revoked writes. */
11476 	_pmap_txm_retype_prepare(pa);
11477 
11478 	pvh_unlock(&locked_pvh);
11479 
11480 	/* XNU needs to hold an RO reference to the page despite the ownership being transferred to TXM. */
11481 	pmap_enter_addr(kernel_pmap, addr, pa, VM_PROT_READ, VM_PROT_NONE, 0, true, PMAP_MAPPING_TYPE_INFER);
11482 
11483 	/* Finally, retype the page to TXM_DEFAULT. */
11484 	_pmap_txm_transfer_page(pa);
11485 }
11486 
11487 struct vm_object txm_vm_object_storage VM_PAGE_PACKED_ALIGNED;
11488 SECURITY_READ_ONLY_LATE(vm_object_t) txm_vm_object = &txm_vm_object_storage;
11489 
11490 _Static_assert(sizeof(vm_map_address_t) == sizeof(pmap_paddr_t),
11491     "sizeof(vm_map_address_t) != sizeof(pmap_paddr_t)");
11492 
11493 vm_map_address_t
11494 pmap_txm_allocate_page(void)
11495 {
11496 	pmap_paddr_t phys_addr = 0;
11497 	vm_page_t page = VM_PAGE_NULL;
11498 	boolean_t thread_vm_privileged = false;
11499 
11500 	/* We are allowed to allocate privileged memory */
11501 	thread_vm_privileged = set_vm_privilege(true);
11502 
11503 	/* Allocate a page from the VM free list */
11504 	vm_grab_options_t grab_options = VM_PAGE_GRAB_OPTIONS_NONE;
11505 	while ((page = vm_page_grab_options(grab_options)) == VM_PAGE_NULL) {
11506 		VM_PAGE_WAIT();
11507 	}
11508 
11509 	/* Wire all of the pages allocated for TXM */
11510 	vm_page_lock_queues();
11511 	vm_page_wire(page, VM_KERN_MEMORY_SECURITY, TRUE);
11512 	vm_page_unlock_queues();
11513 
11514 	phys_addr = (pmap_paddr_t)ptoa(VM_PAGE_GET_PHYS_PAGE(page));
11515 	if (phys_addr == 0) {
11516 		panic("invalid VM page allocated for TXM: %llu", phys_addr);
11517 	}
11518 
11519 	/* Add the physical page to the TXM VM object */
11520 	vm_object_lock(txm_vm_object);
11521 	vm_page_insert_wired(
11522 		page,
11523 		txm_vm_object,
11524 		phys_addr - gPhysBase,
11525 		VM_KERN_MEMORY_SECURITY);
11526 	vm_object_unlock(txm_vm_object);
11527 
11528 	/* Reset thread privilege */
11529 	set_vm_privilege(thread_vm_privileged);
11530 
11531 	/* Retype the page */
11532 	_pmap_txm_transfer_page(phys_addr);
11533 
11534 	return phys_addr;
11535 }
11536 
11537 int
11538 pmap_cs_configuration(void)
11539 {
11540 	code_signing_config_t config = 0;
11541 
11542 	/* Compute the code signing configuration */
11543 	code_signing_configuration(NULL, &config);
11544 
11545 	return (int)config;
11546 }
11547 
11548 bool
11549 pmap_performs_stage2_translations(
11550 	__unused pmap_t pmap)
11551 {
11552 	return false;
11553 }
11554 
11555 bool
11556 pmap_has_iofilter_protected_write(void)
11557 {
11558 #if HAS_GUARDED_IO_FILTER
11559 	return true;
11560 #else
11561 	return false;
11562 #endif
11563 }
11564 
11565 #if HAS_GUARDED_IO_FILTER
11566 
11567 void
11568 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11569 {
11570 	/**
11571 	 * Even though this is done from EL1/2 for an address potentially owned by Guarded
11572 	 * Mode, we should be fine, as mmu_kvtop uses "at s1e1r", which checks for read
11573 	 * access only.
11574 	 */
11575 	const pmap_paddr_t pa = mmu_kvtop(addr);
11576 
11577 	if (!pa) {
11578 		panic("%s: addr 0x%016llx doesn't have a valid kernel mapping", __func__, (uint64_t) addr);
11579 	}
11580 
11581 	const sptm_frame_type_t frame_type = sptm_get_frame_type(pa);
11582 	if (frame_type == XNU_PROTECTED_IO) {
11583 		sptm_iofilter_protected_write(pa, value, width);
11584 	} else {
11585 		/* The mapping is valid but not covered by the I/O filter. However, we still try
11586 		 * accessing the address from kernel mode. This allows addresses that are not
11587 		 * owned by SPTM to be accessed by this interface.
11588 		 */
11589 		switch (width) {
11590 		case 1:
11591 			*(volatile uint8_t *)addr = (uint8_t) value;
11592 			break;
11593 		case 2:
11594 			*(volatile uint16_t *)addr = (uint16_t) value;
11595 			break;
11596 		case 4:
11597 			*(volatile uint32_t *)addr = (uint32_t) value;
11598 			break;
11599 		case 8:
11600 			*(volatile uint64_t *)addr = (uint64_t) value;
11601 			break;
11602 		default:
11603 			panic("%s: width %llu not supported", __func__, width);
11604 		}
11605 	}
11606 }
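
/*
 * Illustrative use (hypothetical register address): a driver updating a 32-bit
 * register that lives in an I/O-filter-protected (XNU_PROTECTED_IO) page can
 * route the store through the SPTM, while any other valid kernel address falls
 * back to the plain volatile store above:
 *
 *     pmap_iofilter_protected_write(reg_va, 0x1, 4);
 */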
11607 
11608 #else /* HAS_GUARDED_IO_FILTER */
11609 
11610 __attribute__((__noreturn__))
11611 void
11612 pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
11613 {
11614 	panic("%s called on an unsupported platform.", __FUNCTION__);
11615 }
11616 
11617 #endif /* HAS_GUARDED_IO_FILTER */
11618 
11619 void * __attribute__((noreturn))
11620 pmap_claim_reserved_ppl_page(void)
11621 {
11622 	panic("%s: function not supported in this environment", __FUNCTION__);
11623 }
11624 
11625 void __attribute__((noreturn))
11626 pmap_free_reserved_ppl_page(void __unused *kva)
11627 {
11628 	panic("%s: function not supported in this environment", __FUNCTION__);
11629 }
11630 
11631 bool
11632 pmap_lookup_in_loaded_trust_caches(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11633 {
11634 	kern_return_t kr = query_trust_cache(
11635 		kTCQueryTypeLoadable,
11636 		cdhash,
11637 		NULL);
11638 
11639 	if (kr == KERN_SUCCESS) {
11640 		return true;
11641 	}
11642 	return false;
11643 }
11644 
11645 uint32_t
11646 pmap_lookup_in_static_trust_cache(__unused const uint8_t cdhash[CS_CDHASH_LEN])
11647 {
11648 	TrustCacheQueryToken_t query_token = {0};
11649 	kern_return_t kr = KERN_NOT_FOUND;
11650 	uint64_t flags = 0;
11651 	uint8_t hash_type = 0;
11652 
11653 	kr = query_trust_cache(
11654 		kTCQueryTypeStatic,
11655 		cdhash,
11656 		&query_token);
11657 
11658 	if (kr == KERN_SUCCESS) {
11659 		amfi->TrustCache.queryGetFlags(&query_token, &flags);
11660 		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
11661 
11662 		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
11663 		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
11664 		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
11665 	}
11666 
11667 	return 0;
11668 }
11669 
11670 #if DEVELOPMENT || DEBUG
11671 
11672 struct page_table_dump_header {
11673 	uint64_t pa;
11674 	uint64_t num_entries;
11675 	uint64_t start_va;
11676 	uint64_t end_va;
11677 };
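
/*
 * Dump layout produced by pmap_dump_page_tables_recurse() below: tables are
 * emitted depth-first, and each table at a level selected by level_mask
 * contributes a page_table_dump_header immediately followed by a verbatim copy
 * of its num_entries translation table entries. With all levels selected, the
 * buffer begins:
 *
 *     [L1 header][L1 entries][L2 header][L2 entries][L3 header][L3 entries]...
 */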
11678 
11679 static kern_return_t
11680 pmap_dump_page_tables_recurse(pmap_t pmap,
11681     const tt_entry_t *ttp,
11682     unsigned int cur_level,
11683     unsigned int level_mask,
11684     uint64_t start_va,
11685     void *buf_start,
11686     void *buf_end,
11687     size_t *bytes_copied)
11688 {
11689 	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11690 	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
11691 
11692 	uint64_t size = pt_attr->pta_level_info[cur_level].size;
11693 	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
11694 	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
11695 	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
11696 
11697 	void *bufp = (uint8_t*)buf_start + *bytes_copied;
11698 
11699 	if (cur_level == pt_attr_root_level(pt_attr)) {
11700 		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
11701 		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
11702 	}
11703 
11704 	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
11705 	const tt_entry_t *tt_end = &ttp[num_entries];
11706 
11707 	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
11708 		return KERN_INSUFFICIENT_BUFFER_SIZE;
11709 	}
11710 
11711 	if (level_mask & (1U << cur_level)) {
11712 		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
11713 		header->pa = kvtophys_nofail((vm_offset_t)ttp);
11714 		header->num_entries = num_entries;
11715 		header->start_va = start_va;
11716 		header->end_va = start_va + (num_entries * size);
11717 
11718 		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
11719 		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
11720 	}
11721 	uint64_t current_va = start_va;
11722 
11723 	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
11724 		tt_entry_t tte = *ttep;
11725 
11726 		if (!(tte & valid_mask)) {
11727 			continue;
11728 		}
11729 
11730 		if ((tte & type_mask) == type_block) {
11731 			continue;
11732 		} else {
11733 			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
11734 				panic("%s: corrupt entry %#llx at %p, "
11735 				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
11736 				    __FUNCTION__, tte, ttep,
11737 				    ttp, cur_level, bufp, buf_end);
11738 			}
11739 
11740 			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
11741 
11742 			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
11743 			    level_mask, current_va, buf_start, buf_end, bytes_copied);
11744 
11745 			if (recurse_result != KERN_SUCCESS) {
11746 				return recurse_result;
11747 			}
11748 		}
11749 	}
11750 
11751 	return KERN_SUCCESS;
11752 }
11753 
11754 kern_return_t
11755 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
11756 {
11757 	if (not_in_kdp) {
11758 		panic("pmap_dump_page_tables must only be called from kernel debugger context");
11759 	}
11760 	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
11761 	           level_mask, pmap->min, bufp, buf_end, bytes_copied);
11762 }
11763 
11764 #else /* DEVELOPMENT || DEBUG */
11765 
11766 kern_return_t
11767 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
11768     unsigned int level_mask __unused, size_t *bytes_copied __unused)
11769 {
11770 	return KERN_NOT_SUPPORTED;
11771 }
11772 #endif /* !(DEVELOPMENT || DEBUG) */
11773 
11774 
11775 #ifdef CONFIG_XNUPOST
11776 static volatile bool pmap_test_took_fault = false;
11777 
11778 static bool
11779 pmap_test_fault_handler(arm_saved_state_t * state)
11780 {
11781 	bool retval                 = false;
11782 	uint64_t esr                = get_saved_state_esr(state);
11783 	esr_exception_class_t class = ESR_EC(esr);
11784 	fault_status_t fsc          = ISS_IA_FSC(ESR_ISS(esr));
11785 
11786 	if ((class == ESR_EC_DABORT_EL1) &&
11787 	    ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
11788 		pmap_test_took_fault = true;
11789 		/* return to the instruction immediately after the call to NX page */
11790 		set_saved_state_pc(state, get_saved_state_pc(state) + 4);
11791 		retval = true;
11792 	}
11793 
11794 	return retval;
11795 }
11796 
11797 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
11798 static NOKASAN bool
11799 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
11800 {
11801 	pmap_t old_pmap = NULL;
11802 	thread_t thread = current_thread();
11803 
11804 	pmap_test_took_fault = false;
11805 
11806 	/*
11807 	 * We're potentially switching pmaps without using the normal thread
11808 	 * mechanism; disable interrupts and preemption to avoid any unexpected
11809 	 * memory accesses.
11810 	 */
11811 	const boolean_t old_int_state = ml_set_interrupts_enabled(FALSE);
11812 	mp_disable_preemption();
11813 
11814 	if (pmap != NULL) {
11815 		old_pmap = current_pmap();
11816 		pmap_switch(pmap, thread);
11817 
11818 		/* Disable PAN; pmap shouldn't be the kernel pmap. */
11819 #if __ARM_PAN_AVAILABLE__
11820 		__builtin_arm_wsr("pan", 0);
11821 #endif /* __ARM_PAN_AVAILABLE__ */
11822 	}
11823 
11824 	ml_expect_fault_begin(pmap_test_fault_handler, va);
11825 
11826 	if (is_write) {
11827 		*((volatile uint64_t*)(va)) = 0xdec0de;
11828 	} else {
11829 		volatile uint64_t tmp = *((volatile uint64_t*)(va));
11830 		(void)tmp;
11831 	}
11832 
11833 	/* Save the fault bool, and undo the gross stuff we did. */
11834 	bool took_fault = pmap_test_took_fault;
11835 	ml_expect_fault_end();
11836 
11837 	if (pmap != NULL) {
11838 #if __ARM_PAN_AVAILABLE__
11839 		__builtin_arm_wsr("pan", 1);
11840 #endif /* __ARM_PAN_AVAILABLE__ */
11841 
11842 		pmap_switch(old_pmap, thread);
11843 	}
11844 
11845 	mp_enable_preemption();
11846 	ml_set_interrupts_enabled(old_int_state);
11847 	bool retval = (took_fault == should_fault);
11848 	return retval;
11849 }
11850 
11851 static bool
11852 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
11853 {
11854 	bool retval = pmap_test_access(pmap, va, should_fault, false);
11855 
11856 	if (!retval) {
11857 		T_FAIL("%s: %s, "
11858 		    "pmap=%p, va=%p, should_fault=%u",
11859 		    __func__, should_fault ? "did not fault" : "faulted",
11860 		    pmap, (void*)va, (unsigned)should_fault);
11861 	}
11862 
11863 	return retval;
11864 }
11865 
11866 static bool
11867 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
11868 {
11869 	bool retval = pmap_test_access(pmap, va, should_fault, true);
11870 
11871 	if (!retval) {
11872 		T_FAIL("%s: %s, "
11873 		    "pmap=%p, va=%p, should_fault=%u",
11874 		    __func__, should_fault ? "did not fault" : "faulted",
11875 		    pmap, (void*)va, (unsigned)should_fault);
11876 	}
11877 
11878 	return retval;
11879 }
11880 
11881 static bool
11882 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
11883 {
11884 	unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
11885 	unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
11886 
11887 	bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
11888 
11889 	if (!retval) {
11890 		T_FAIL("%s: bits=%u, "
11891 		    "pa=%p, should_be_set=%u",
11892 		    __func__, bits,
11893 		    (void*)pa, should_be_set);
11894 	}
11895 
11896 	return retval;
11897 }
11898 
11899 static __attribute__((noinline)) bool
11900 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
11901 {
11902 	bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
11903 	return retval;
11904 }
11905 
11906 static int
11907 pmap_test_test_config(unsigned int flags)
11908 {
11909 	T_LOG("running pmap_test_test_config flags=0x%X", flags);
11910 	unsigned int map_count = 0;
11911 	unsigned long page_ratio = 0;
11912 	pmap_t pmap = pmap_create_options(NULL, 0, flags);
11913 
11914 	if (!pmap) {
11915 		panic("Failed to allocate pmap");
11916 	}
11917 
11918 	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11919 	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
11920 	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
11921 	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
11922 
11923 	if (pmap_page_size <= native_page_size) {
11924 		page_ratio = native_page_size / pmap_page_size;
11925 	} else {
11926 		/*
11927 		 * This configuration implies a page_ratio of less than 1, which is
11928 		 * not currently supported by the pmap layer; panic.
11929 		 */
11930 		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
11931 		    ", flags=%u",
11932 		    __func__, native_page_size, pmap_page_size,
11933 		    flags);
11934 	}
11935 
11936 	if (PAGE_RATIO > 1) {
11937 		/*
11938 		 * The kernel is deliberately pretending to have 16KB pages.
11939 		 * The pmap layer has code that supports this, so pretend the
11940 		 * page size is larger than it is.
11941 		 */
11942 		pmap_page_size = PAGE_SIZE;
11943 		native_page_size = PAGE_SIZE;
11944 	}
11945 
11946 	/*
11947 	 * Get two pages from the VM; one to be mapped wired, and one to be
11948 	 * mapped nonwired.
11949 	 */
11950 	vm_page_t unwired_vm_page = vm_page_grab();
11951 	vm_page_t wired_vm_page = vm_page_grab();
11952 
11953 	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
11954 		panic("Failed to grab VM pages");
11955 	}
11956 
11957 	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
11958 	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
11959 
11960 	pmap_paddr_t pa = ptoa(pn);
11961 	pmap_paddr_t wired_pa = ptoa(wired_pn);
11962 
11963 	/*
11964 	 * We'll start mappings at the second twig TT.  This keeps us from only
11965 	 * using the first entry in each TT, which would trivially be address
11966 	 * 0; one of the things we will need to test is retrieving the VA for
11967 	 * a given PTE.
11968 	 */
11969 	vm_map_address_t va_base = pmap_twig_size;
11970 	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
11971 
11972 	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
11973 		/*
11974 		 * Not exactly a functional failure, but this test relies on
11975 		 * there being a spare PTE slot we can use to pin the TT.
11976 		 */
11977 		panic("Cannot pin translation table");
11978 	}
11979 
11980 	/*
11981 	 * Create the wired mapping; this will prevent the pmap layer from
11982 	 * reclaiming our test TTs, which would interfere with this test
11983 	 * ("interfere" -> "make it panic").
11984 	 */
11985 	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true, PMAP_MAPPING_TYPE_INFER);
11986 
11987 	T_LOG("Validate that kernel cannot write to SPTM memory.");
11988 	pt_entry_t * ptep = pmap_pte(pmap, va_base);
11989 	pmap_test_write(NULL, (vm_map_address_t)ptep, true);
11990 
11991 	/*
11992 	 * Create read-only mappings of the nonwired page; if the pmap does
11993 	 * not use the same page size as the kernel, create multiple mappings
11994 	 * so that the kernel page is fully mapped.
11995 	 */
11996 	for (map_count = 0; map_count < page_ratio; map_count++) {
11997 		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)),
11998 		    VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
11999 	}
12000 
12001 	/* Validate that all the PTEs have the expected PA and VA. */
12002 	for (map_count = 0; map_count < page_ratio; map_count++) {
12003 		ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
12004 
12005 		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
12006 			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
12007 			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
12008 		}
12009 
12010 		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
12011 			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
12012 			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
12013 		}
12014 	}
12015 
12016 	T_LOG("Validate that reads to our mapping do not fault.");
12017 	pmap_test_read(pmap, va_base, false);
12018 
12019 	T_LOG("Validate that writes to our mapping fault.");
12020 	pmap_test_write(pmap, va_base, true);
12021 
12022 	T_LOG("Make the first mapping writable.");
12023 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12024 
12025 	T_LOG("Validate that writes to our mapping do not fault.");
12026 	pmap_test_write(pmap, va_base, false);
12027 
12028 	/*
12029 	 * For page ratios of greater than 1: validate that writes to the other
12030 	 * mappings still fault.  Remove the mappings afterwards (we're done
12031 	 * with page ratio testing).
12032 	 */
12033 	for (map_count = 1; map_count < page_ratio; map_count++) {
12034 		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
12035 		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
12036 	}
12037 
12038 	/* Remove remaining mapping */
12039 	pmap_remove(pmap, va_base, va_base + pmap_page_size);
12040 
12041 	T_LOG("Make the first mapping execute-only");
12042 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false, PMAP_MAPPING_TYPE_INFER);
12043 
12044 
12045 	T_LOG("Validate that reads to our mapping do not fault.");
12046 	pmap_test_read(pmap, va_base, false);
12047 
12051 	T_LOG("Validate that writes to our mapping fault.");
12052 	pmap_test_write(pmap, va_base, true);
12053 
12054 	pmap_remove(pmap, va_base, va_base + pmap_page_size);
12055 
12056 	T_LOG("Mark the page unreferenced and unmodified.");
12057 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12058 	pmap_test_check_refmod(pa, 0);
12059 
12060 	/*
12061 	 * Begin testing the ref/mod state machine.  Re-enter the mapping with
12062 	 * different protection/fault_type settings, and confirm that the
12063 	 * ref/mod state matches our expectations at each step.
12064 	 */
12065 	T_LOG("!ref/!mod: read, no fault.  Expect ref/!mod");
12066 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12067 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12068 
12069 	T_LOG("!ref/!mod: read, read fault.  Expect ref/!mod");
12070 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12071 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12072 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12073 
12074 	T_LOG("!ref/!mod: rw, read fault.  Expect ref/!mod");
12075 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12076 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false, PMAP_MAPPING_TYPE_INFER);
12077 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12078 
12079 	T_LOG("ref/!mod: rw, read fault.  Expect ref/!mod");
12080 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12081 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12082 
12083 	T_LOG("!ref/!mod: rw, rw fault.  Expect ref/mod");
12084 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12085 	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12086 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12087 
12088 	/*
12089 	 * Shared memory testing; we'll have two mappings; one read-only,
12090 	 * one read-write.
12091 	 */
12092 	vm_map_address_t rw_base = va_base;
12093 	vm_map_address_t ro_base = va_base + pmap_page_size;
12094 
12095 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12096 	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false, PMAP_MAPPING_TYPE_INFER);
12097 
12098 	/*
12099 	 * Test that we take faults as expected for unreferenced/unmodified
12100 	 * pages.  Also test the arm_fast_fault interface, to ensure that
12101 	 * mapping permissions change as expected.
12102 	 */
12103 	T_LOG("!ref/!mod: expect no access");
12104 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12105 	pmap_test_read_write(pmap, ro_base, false, false);
12106 	pmap_test_read_write(pmap, rw_base, false, false);
12107 
12108 	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
12109 	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
12110 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
12111 	pmap_test_read_write(pmap, ro_base, true, false);
12112 	pmap_test_read_write(pmap, rw_base, true, false);
12113 
12114 	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
12115 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12116 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12117 	pmap_test_read_write(pmap, ro_base, true, false);
12118 	pmap_test_read_write(pmap, rw_base, true, true);
12119 
12120 	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
12121 	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
12122 	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
12123 	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
12124 	pmap_test_read_write(pmap, ro_base, true, false);
12125 	pmap_test_read_write(pmap, rw_base, true, true);
12126 
12127 	T_LOG("RW protect both mappings; should not change protections.");
12128 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12129 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
12130 	pmap_test_read_write(pmap, ro_base, true, false);
12131 	pmap_test_read_write(pmap, rw_base, true, true);
12132 
12133 	T_LOG("Read protect both mappings; RW mapping should become RO.");
12134 	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
12135 	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
12136 	pmap_test_read_write(pmap, ro_base, true, false);
12137 	pmap_test_read_write(pmap, rw_base, true, false);
12138 
12139 	T_LOG("RW protect the page; mappings should not change protections.");
12140 	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false, PMAP_MAPPING_TYPE_INFER);
12141 	pmap_page_protect(pn, VM_PROT_ALL);
12142 	pmap_test_read_write(pmap, ro_base, true, false);
12143 	pmap_test_read_write(pmap, rw_base, true, true);
12144 
12145 	T_LOG("Read protect the page; RW mapping should become RO.");
12146 	pmap_page_protect(pn, VM_PROT_READ);
12147 	pmap_test_read_write(pmap, ro_base, true, false);
12148 	pmap_test_read_write(pmap, rw_base, true, false);
12149 
12150 	T_LOG("Validate that disconnect removes all known mappings of the page.");
12151 	pmap_disconnect(pn);
12152 	if (!pmap_verify_free(pn)) {
12153 		T_FAIL("Page still has mappings");
12154 	}
12155 
12156 #if defined(ARM_LARGE_MEMORY)
12157 #define PMAP_TEST_LARGE_MEMORY_VA (64 * (1ULL << 40)) /* 64 TB */
12158 
12159 	T_LOG("Create new wired mapping in the extended address space enabled by ARM_LARGE_MEMORY.");
12160 	pmap_enter_addr(pmap, PMAP_TEST_LARGE_MEMORY_VA, wired_pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, true, PMAP_MAPPING_TYPE_INFER);
12161 	pmap_test_read_write(pmap, PMAP_TEST_LARGE_MEMORY_VA, true, true);
12162 	pmap_remove(pmap, PMAP_TEST_LARGE_MEMORY_VA, PMAP_TEST_LARGE_MEMORY_VA + pmap_page_size);
12163 #endif /* ARM_LARGE_MEMORY */
12164 
12165 	T_LOG("Remove the wired mapping, so we can tear down the test map.");
12166 	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
12167 	pmap_destroy(pmap);
12168 
12169 	T_LOG("Release the pages back to the VM.");
12170 	vm_page_lock_queues();
12171 	vm_page_free(unwired_vm_page);
12172 	vm_page_free(wired_vm_page);
12173 	vm_page_unlock_queues();
12174 
12175 	T_LOG("Testing successful!");
12176 	return 0;
12177 }
12178 
12179 kern_return_t
12180 pmap_test(void)
12181 {
12182 	T_LOG("Starting pmap_tests");
12183 	int flags = 0;
12184 	flags |= PMAP_CREATE_64BIT;
12185 
12186 #if __ARM_MIXED_PAGE_SIZE__ && !CONFIG_SPTM
12187 	T_LOG("Testing VM_PAGE_SIZE_4KB");
12188 	pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
12189 	T_LOG("Testing VM_PAGE_SIZE_16KB");
12190 	pmap_test_test_config(flags);
12191 #else /* __ARM_MIXED_PAGE_SIZE__ */
12192 	pmap_test_test_config(flags);
12193 #endif /* __ARM_MIXED_PAGE_SIZE__ */
12194 
12195 	T_PASS("completed pmap_test successfully");
12196 	return KERN_SUCCESS;
12197 }
12198 #endif /* CONFIG_XNUPOST */
12199 
12200 /*
12201  * The following function should never make it to RELEASE code, since
12202  * it provides a way to get the PPL to modify text pages.
12203  */
12204 #if DEVELOPMENT || DEBUG
12205 
12206 /**
12207  * Forcibly overwrite executable text with an illegal instruction.
12208  *
12209  * @note Only used for xnu unit testing.
12210  *
12211  * @param pa The physical address to corrupt.
12212  *
12213  * @return KERN_SUCCESS on success.
12214  */
12215 kern_return_t
12216 pmap_test_text_corruption(pmap_paddr_t pa __unused)
12217 {
12218 	/*
12219 	 * SPTM TODO: implement an SPTM version of this.
12220 	 * The physical aperture is owned by the SPTM and text
12221 	 * pages have RO physical aperture mappings.
12222 	 */
12223 	return KERN_SUCCESS;
12224 }
12225 
12226 #endif /* DEVELOPMENT || DEBUG */
12227 
12228