1 /*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121
122 #define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))
123
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
/*
 * Dispatch table of page-table operations for the native page table format.
 * Consumers reach these through pmap_get_pt_ops() (see e.g. PMAP_UPDATE_TLBS),
 * which allows alternate page table formats to supply their own handlers.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140
/*
 * Per-level geometry and descriptor-format constants for 16K-granule page
 * tables, indexed by translation level (0-3). Levels 0-2 use the table/block
 * TTE encodings; level 3 uses the page (PTE) valid/type encodings.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries use the PTE (page) format. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180
/*
 * Per-level geometry and descriptor-format constants for 4K-granule page
 * tables, indexed by translation level (0-3). Mirrors
 * pmap_table_level_info_16k above, but with the 4K granule's sizes and masks.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: entries use the PTE (page) format. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220
/*
 * Page table attribute descriptor for 4K-granule page tables: level info,
 * root/leaf levels, AP (access permission) and XN encodings, and page size.
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	/*
	 * Root level derived from T0SZ_BOOT; presumably each 4K-granule level
	 * resolves 9 bits of VA — TODO confirm against proc_reg.h definitions.
	 */
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* PTE access-permission encodings for each protection flavor. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	/* Execute-never: both privileged (PNX) and unprivileged (NX) bits. */
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_page_shift = 12,
};
247
/*
 * Page table attribute descriptor for 16K-granule page tables; see
 * pmap_pt_attr_4k above for the meaning of the individual fields.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_page_shift = 14,
};
266
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272
273
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279
280 #endif /* MACH_ASSERT */
281
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 (current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288
289
/*
 * Represents a TLB range that will be flushed before exiting
 * the PPL.
 * Used by phys_attribute_clear_range to defer flushing pages in
 * this range until the end of the operation.
 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;             /* pmap whose mappings fall within the range */
	vm_map_address_t ptfr_start;  /* start VA of the range */
	vm_map_address_t ptfr_end;    /* end VA of the range */
	bool ptfr_flush_needed;       /* set when a deferred flush is actually required */
} pmap_tlb_flush_range_t;
302
303 #if XNU_MONITOR
304 /*
305 * PPL External References.
306 */
307 extern vm_offset_t segPPLDATAB;
308 extern unsigned long segSizePPLDATA;
309 extern vm_offset_t segPPLTEXTB;
310 extern unsigned long segSizePPLTEXT;
311 extern vm_offset_t segPPLDATACONSTB;
312 extern unsigned long segSizePPLDATACONST;
313
314
315 /*
316 * PPL Global Variables
317 */
318
319 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
320 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
321 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
322 #else
323 const boolean_t pmap_ppl_disable = FALSE;
324 #endif
325
326 /*
327 * Indicates if the PPL has started applying APRR.
328 * This variable is accessed from various assembly trampolines, so be sure to change
329 * those if you change the size or layout of this variable.
330 */
331 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
332
333 extern void *pmap_stacks_start;
334 extern void *pmap_stacks_end;
335
#endif /* XNU_MONITOR */
337
338
339
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
342 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344
345 extern uint8_t bootstrap_pagetables[];
346
347 extern unsigned int not_in_kdp;
348
349 extern vm_offset_t first_avail;
350
351 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
352 extern vm_offset_t virtual_space_end; /* End of kernel address space */
353 extern vm_offset_t static_memory_end;
354
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357
358 extern int maxproc, hard_maxproc;
359
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362
363 /*
364 * The bounds on our TTBRs. These are for sanity checking that
365 * an address is accessible by a TTBR before we attempt to map it.
366 */
367
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373
374 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t kernel_pmap = &kernel_pmap_store;
376
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
378
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
382
383 typedef struct tt_free_entry {
384 struct tt_free_entry *next;
385 } tt_free_entry_t;
386
387 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
388
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX 4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int free_tt_max MARK_AS_PMAP_DATA;
400
401 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
402
403 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
415
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1; /* enable no-execute protection */
421 int allow_data_exec = 0; /* No apps may execute data */
422 int allow_stack_exec = 0; /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1; /* enable no-execute protection */
428 const int allow_data_exec = 0; /* No apps may execute data */
429 const int allow_stack_exec = 0; /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431
432 /**
433 * This variable is set true during hibernation entry to protect pmap data structures
434 * during image copying, and reset false on hibernation exit.
435 */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437
#if MACH_ASSERT
/* Validate a pmap's ledger balances on destruction (MACH_ASSERT-only diagnostic). */
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger checking is compiled out on non-MACH_ASSERT kernels; this is a no-op stub. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
446
447 /**
448 * This helper function ensures that potentially-long-running batched PPL operations are
449 * called in preemptible context before entering the PPL, so that the PPL call may
450 * periodically exit to allow pending urgent ASTs to be taken.
451 */
452 static inline void
pmap_verify_preemptible(void)453 pmap_verify_preemptible(void)
454 {
455 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
456 }
457
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
462
463 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
464
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470 "Minimum address space size outside allowable range");
471
472 // Max offset is 15.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 11.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476
477
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479 "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481 "Small device address space size outside allowable range");
482
483 # ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 # else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 # endif
488 #endif /* __arm64__ */
489
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
499 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
500 #if !HAS_16BIT_ASID
501 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
502 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
503 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
504 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
505 #else
506 static uint16_t last_allocated_asid = 0;
507 #endif /* !HAS_16BIT_ASID */
508
509
510 #if __ARM_MIXED_PAGE_SIZE__
511 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
512 #endif
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
514 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
515 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
516
/* PTE Define Macros */

/*
 * Evaluates to TRUE iff (x) is a VM "compressed" PTE marker: the entry must
 * be invalid, carry ARM_PTE_COMPRESSED, and have no bits set outside
 * ARM_PTE_COMPRESSED_MASK. Any extra bits indicate corruption and panic.
 * (p) is the PTE's address, used only for the panic message.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) && /* PTE is not valid... */ \
	((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* TRUE iff the PTE carries the software "wired" marker. */
#define pte_is_wired(pte) \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* TRUE iff the PTE carries the software "was writeable" marker. */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" marker in a PTE lvalue. */
#define pte_set_was_writeable(pte, was_writeable) \
	do {                                         \
		if ((was_writeable)) {               \
			(pte) |= ARM_PTE_WRITEABLE;  \
		} else {                             \
			(pte) &= ~ARM_PTE_WRITEABLE; \
		}                                    \
	} while(0)
540
541 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)542 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
543 {
544 if (wired) {
545 *ptep |= ARM_PTE_WIRED;
546 } else {
547 *ptep &= ~ARM_PTE_WIRED;
548 }
549 /*
550 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
551 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
552 * never reclaimed.
553 */
554 if (pmap == kernel_pmap) {
555 return;
556 }
557 unsigned short *ptd_wiredcnt_ptr;
558 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
559 if (wired) {
560 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
561 } else {
562 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 if (__improbable(prev_wired == 0)) {
564 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
565 }
566 }
567 }
568
569 #if HAS_FEAT_XS
570
571 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)572 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
573 {
574 if (__improbable(pt_attr->stage2)) {
575 return false;
576 }
577 switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
578 case CACHE_ATTRINDX_POSTED_XS:
579 case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
580 return true;
581 default:
582 return false;
583 }
584 }
585
586 #endif /* HAS_FEAT_XS */
587
588 #define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) { \
589 pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
590 arm64_sync_tlb(strong); \
591 }
592
593 /*
594 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
595 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
596 * will observe the updated PTE.
597 */
598 #define FLUSH_PTE() \
599 __builtin_arm_dmb(DMB_ISH);
600
601 /*
602 * Synchronize updates to PTEs that were previously valid and thus may be cached in
603 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
604 * TLBI. This should only require a store-store barrier, as subsequent accesses in
605 * program order will not issue until the DSB completes. Prior loads may be reordered
606 * after the barrier, but their behavior should not be materially affected by the
607 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
608 * matter for loads until the access is re-driven well after the TLB update is
609 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
610 * we should be in a position to handle access faults. For "voluntary" PTE access
611 * restriction due to unmapping or protection, the decision to restrict access should
612 * have a data dependency on prior loads in order to avoid a data race.
613 */
614 #define FLUSH_PTE_STRONG() \
615 __builtin_arm_dsb(DSB_ISHST);
616
617 /**
618 * Write enough page table entries to map a single VM page. On systems where the
619 * VM page size does not match the hardware page size, multiple page table
620 * entries will need to be written.
621 *
622 * @note This function does not emit a barrier to ensure these page table writes
623 * have completed before continuing. This is commonly needed. In the case
624 * where a DMB or DSB barrier is needed, then use the write_pte() and
625 * write_pte_strong() functions respectively instead of this one.
626 *
627 * @param ptep Pointer to the first page table entry to update.
628 * @param pte The value to write into each page table entry. In the case that
629 * multiple PTEs are updated to a non-empty value, then the address
630 * in this value will automatically be incremented for each PTE
631 * write.
632 */
633 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)634 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
635 {
636 /**
637 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
638 * systems, which is why it's checked at runtime instead of compile time.
639 * The "unreachable" warning needs to be suppressed because it still is a
640 * compile time constant on some systems.
641 */
642 __unreachable_ok_push
643 if (TEST_PAGE_RATIO_4) {
644 if (((uintptr_t)ptep) & 0x1f) {
645 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
646 __func__, ptep, (void*)pte);
647 }
648
649 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
650 /**
651 * If we're writing an empty/compressed PTE value, then don't
652 * auto-increment the address for each PTE write.
653 */
654 *ptep = pte;
655 *(ptep + 1) = pte;
656 *(ptep + 2) = pte;
657 *(ptep + 3) = pte;
658 } else {
659 *ptep = pte;
660 *(ptep + 1) = pte | 0x1000;
661 *(ptep + 2) = pte | 0x2000;
662 *(ptep + 3) = pte | 0x3000;
663 }
664 } else {
665 *ptep = pte;
666 }
667 __unreachable_ok_pop
668 }
669
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Memory Barrier.
 *
 * @note The DMB issued by this function is not strong enough to protect against
 *       TLB invalidates from being reordered above the PTE writes. If a TLBI
 *       instruction is going to immediately be called after this write, it's
 *       recommended to call write_pte_strong() instead of this function.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 */
void
write_pte(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE(); /* DMB ISH: make the PTE store(s) visible to subsequent loads. */
}
688
/**
 * Writes enough page table entries to map a single VM page and then ensures
 * those writes complete by executing a Data Synchronization Barrier. This
 * barrier provides stronger guarantees than the DMB executed by write_pte().
 *
 * @note This function is useful if you're going to immediately flush the TLB
 *       after making the PTE write. A DSB is required to protect against the
 *       TLB invalidate being reordered before the PTE write.
 *
 * See the function header for write_pte_fast() for more details on the
 * parameters.
 */
static void
write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
{
	write_pte_fast(ptep, pte);
	FLUSH_PTE_STRONG(); /* DSB ISHST: order the PTE store(s) before any ensuing TLBI. */
}
707
708 /**
709 * Retrieve the pmap structure for the thread running on the current CPU.
710 */
711 pmap_t
current_pmap()712 current_pmap()
713 {
714 const pmap_t current = vm_map_pmap(current_thread()->map);
715
716 assert(current != NULL);
717
718 #if XNU_MONITOR
719 /**
720 * On PPL-enabled systems, it's important that PPL policy decisions aren't
721 * decided by kernel-writable memory. This function is used in various parts
722 * of the PPL, and besides validating that the pointer returned by this
723 * function is indeed a pmap structure, it's also important to ensure that
724 * it's actually the current thread's pmap. This is because different pmaps
725 * will have access to different entitlements based on the code signature of
726 * their loaded process. So if a different user pmap is set in the current
727 * thread structure (in an effort to bypass code signing restrictions), even
728 * though the structure would validate correctly as it is a real pmap
729 * structure, it should fail here.
730 *
731 * This only needs to occur for user pmaps because the kernel pmap's root
732 * page table is always the same as TTBR1 (it's set during bootstrap and not
733 * changed so it'd be redundant to check), and its code signing fields are
734 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
735 * it shouldn't be possible to set those fields. Due to that, an attacker
736 * setting the current thread's pmap to the kernel pmap as a way to bypass
737 * this check won't accomplish anything as it doesn't provide any extra code
738 * signing entitlements.
739 */
740 if ((current != kernel_pmap) &&
741 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
742 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
743 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
744 }
745 #endif /* XNU_MONITOR */
746
747 return current;
748 }
749
750 #if DEVELOPMENT || DEBUG
751
752 /*
753 * Trace levels are controlled by a bitmask in which each
754 * level can be enabled/disabled by the (1<<level) position
755 * in the boot arg
756 * Level 0: PPL extension functionality
757 * Level 1: pmap lifecycle (create/destroy/switch)
758 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
759 * Level 3: internal state management (attributes/fast-fault)
760 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
761 */
762
763 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
764
765 #define PMAP_TRACE(level, ...) \
766 if (__improbable((1 << (level)) & pmap_trace_mask)) { \
767 KDBG_RELEASE(__VA_ARGS__); \
768 }
769 #else /* DEVELOPMENT || DEBUG */
770
771 #define PMAP_TRACE(level, ...)
772
773 #endif /* DEVELOPMENT || DEBUG */
774
775
776 /*
777 * Internal function prototypes (forward declarations).
778 */
779
780 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
781
782 static void pmap_set_reference(ppnum_t pn);
783
784 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
785
786 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
787
788 static kern_return_t pmap_expand(
789 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
790
791 static int pmap_remove_range(
792 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
793
794 static tt_entry_t *pmap_tt1_allocate(
795 pmap_t, vm_size_t, unsigned int);
796
797 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
798
799 static void pmap_tt1_deallocate(
800 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
801
802 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
803
804 static kern_return_t pmap_tt_allocate(
805 pmap_t, tt_entry_t **, unsigned int, unsigned int);
806
807 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
808
809 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
810 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
811 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
812
813 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
814
815
816 static void pmap_unmap_commpage(
817 pmap_t pmap);
818
819 static boolean_t
820 pmap_is_64bit(pmap_t);
821
822
823 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
824
825 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
826
827 static bool pmap_update_cache_attributes_locked(
828 ppnum_t, unsigned, bool);
829
830 static boolean_t arm_clear_fast_fault(
831 ppnum_t ppnum,
832 vm_prot_t fault_type,
833 pt_entry_t *pte_p);
834
835 static void pmap_trim_self(pmap_t pmap);
836 static void pmap_trim_subord(pmap_t subord);
837
838
839 /*
840 * Temporary prototypes, while we wait for pmap_enter to move to taking an
841 * address instead of a page number.
842 */
843 static kern_return_t
844 pmap_enter_addr(
845 pmap_t pmap,
846 vm_map_address_t v,
847 pmap_paddr_t pa,
848 vm_prot_t prot,
849 vm_prot_t fault_type,
850 unsigned int flags,
851 boolean_t wired);
852
853 kern_return_t
854 pmap_enter_options_addr(
855 pmap_t pmap,
856 vm_map_address_t v,
857 pmap_paddr_t pa,
858 vm_prot_t prot,
859 vm_prot_t fault_type,
860 unsigned int flags,
861 boolean_t wired,
862 unsigned int options,
863 __unused void *arg);
864
865 #ifdef CONFIG_XNUPOST
866 kern_return_t pmap_test(void);
867 #endif /* CONFIG_XNUPOST */
868
869 PMAP_SUPPORT_PROTOTYPES(
870 kern_return_t,
871 arm_fast_fault, (pmap_t pmap,
872 vm_map_address_t va,
873 vm_prot_t fault_type,
874 bool was_af_fault,
875 bool from_user), ARM_FAST_FAULT_INDEX);
876
877 PMAP_SUPPORT_PROTOTYPES(
878 boolean_t,
879 arm_force_fast_fault, (ppnum_t ppnum,
880 vm_prot_t allow_mode,
881 int options), ARM_FORCE_FAST_FAULT_INDEX);
882
883 MARK_AS_PMAP_TEXT static boolean_t
884 arm_force_fast_fault_with_flush_range(
885 ppnum_t ppnum,
886 vm_prot_t allow_mode,
887 int options,
888 pmap_tlb_flush_range_t *flush_range);
889
/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 */
typedef struct {
	uint64_t page_index : 32,           /* The page index to be operated on */
	    state : 8,                      /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,      /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1, /* Tracking whether the cache flush pass is necessary */
	    :0;                             /* Anonymous zero-width member: pad out the 64-bit unit */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS     1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS   2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE            4

/* The packed state must occupy exactly one 64-bit word. */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
909
910 PMAP_SUPPORT_PROTOTYPES(
911 batch_set_cache_attr_state_t,
912 pmap_batch_set_cache_attributes, (
913 #if XNU_MONITOR
914 volatile upl_page_info_t *user_page_list,
915 #else /* !XNU_MONITOR */
916 upl_page_info_array_t user_page_list,
917 #endif /* XNU_MONITOR */
918 batch_set_cache_attr_state_t state,
919 unsigned int page_cnt,
920 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
921
922 PMAP_SUPPORT_PROTOTYPES(
923 kern_return_t,
924 pmap_change_wiring, (pmap_t pmap,
925 vm_map_address_t v,
926 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
927
928 PMAP_SUPPORT_PROTOTYPES(
929 pmap_t,
930 pmap_create_options, (ledger_t ledger,
931 vm_map_size_t size,
932 unsigned int flags,
933 kern_return_t * kr), PMAP_CREATE_INDEX);
934
935 PMAP_SUPPORT_PROTOTYPES(
936 void,
937 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
938
939 PMAP_SUPPORT_PROTOTYPES(
940 kern_return_t,
941 pmap_enter_options, (pmap_t pmap,
942 vm_map_address_t v,
943 pmap_paddr_t pa,
944 vm_prot_t prot,
945 vm_prot_t fault_type,
946 unsigned int flags,
947 boolean_t wired,
948 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
949
950 PMAP_SUPPORT_PROTOTYPES(
951 pmap_paddr_t,
952 pmap_find_pa, (pmap_t pmap,
953 addr64_t va), PMAP_FIND_PA_INDEX);
954
955 PMAP_SUPPORT_PROTOTYPES(
956 kern_return_t,
957 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
958
959
960 PMAP_SUPPORT_PROTOTYPES(
961 boolean_t,
962 pmap_is_empty, (pmap_t pmap,
963 vm_map_offset_t va_start,
964 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
965
966
967 PMAP_SUPPORT_PROTOTYPES(
968 unsigned int,
969 pmap_map_cpu_windows_copy, (ppnum_t pn,
970 vm_prot_t prot,
971 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
972
973 PMAP_SUPPORT_PROTOTYPES(
974 void,
975 pmap_ro_zone_memcpy, (zone_id_t zid,
976 vm_offset_t va,
977 vm_offset_t offset,
978 const vm_offset_t new_data,
979 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
980
981 PMAP_SUPPORT_PROTOTYPES(
982 uint64_t,
983 pmap_ro_zone_atomic_op, (zone_id_t zid,
984 vm_offset_t va,
985 vm_offset_t offset,
986 zro_atomic_op_t op,
987 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
988
989 PMAP_SUPPORT_PROTOTYPES(
990 void,
991 pmap_ro_zone_bzero, (zone_id_t zid,
992 vm_offset_t va,
993 vm_offset_t offset,
994 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
995
996 PMAP_SUPPORT_PROTOTYPES(
997 vm_map_offset_t,
998 pmap_nest, (pmap_t grand,
999 pmap_t subord,
1000 addr64_t vstart,
1001 uint64_t size,
1002 vm_map_offset_t vrestart,
1003 kern_return_t * krp), PMAP_NEST_INDEX);
1004
1005 PMAP_SUPPORT_PROTOTYPES(
1006 void,
1007 pmap_page_protect_options, (ppnum_t ppnum,
1008 vm_prot_t prot,
1009 unsigned int options,
1010 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1011
1012 PMAP_SUPPORT_PROTOTYPES(
1013 vm_map_address_t,
1014 pmap_protect_options, (pmap_t pmap,
1015 vm_map_address_t start,
1016 vm_map_address_t end,
1017 vm_prot_t prot,
1018 unsigned int options,
1019 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1020
1021 PMAP_SUPPORT_PROTOTYPES(
1022 kern_return_t,
1023 pmap_query_page_info, (pmap_t pmap,
1024 vm_map_offset_t va,
1025 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1026
1027 PMAP_SUPPORT_PROTOTYPES(
1028 mach_vm_size_t,
1029 pmap_query_resident, (pmap_t pmap,
1030 vm_map_address_t start,
1031 vm_map_address_t end,
1032 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1033
1034 PMAP_SUPPORT_PROTOTYPES(
1035 void,
1036 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1037
1038 PMAP_SUPPORT_PROTOTYPES(
1039 vm_map_address_t,
1040 pmap_remove_options, (pmap_t pmap,
1041 vm_map_address_t start,
1042 vm_map_address_t end,
1043 int options), PMAP_REMOVE_OPTIONS_INDEX);
1044
1045
1046 PMAP_SUPPORT_PROTOTYPES(
1047 void,
1048 pmap_set_cache_attributes, (ppnum_t pn,
1049 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1050
1051 PMAP_SUPPORT_PROTOTYPES(
1052 void,
1053 pmap_update_compressor_page, (ppnum_t pn,
1054 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1055
1056 PMAP_SUPPORT_PROTOTYPES(
1057 void,
1058 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1059
1060 #if MACH_ASSERT || XNU_MONITOR
1061 PMAP_SUPPORT_PROTOTYPES(
1062 void,
1063 pmap_set_process, (pmap_t pmap,
1064 int pid,
1065 char *procname), PMAP_SET_PROCESS_INDEX);
1066 #endif
1067
1068 PMAP_SUPPORT_PROTOTYPES(
1069 void,
1070 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1071
1072 PMAP_SUPPORT_PROTOTYPES(
1073 vm_map_offset_t,
1074 pmap_unnest_options, (pmap_t grand,
1075 addr64_t vaddr,
1076 uint64_t size,
1077 vm_map_offset_t vrestart,
1078 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 void,
1082 phys_attribute_set, (ppnum_t pn,
1083 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1084
1085 PMAP_SUPPORT_PROTOTYPES(
1086 void,
1087 phys_attribute_clear, (ppnum_t pn,
1088 unsigned int bits,
1089 int options,
1090 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1091
1092 #if __ARM_RANGE_TLBI__
1093 PMAP_SUPPORT_PROTOTYPES(
1094 vm_map_address_t,
1095 phys_attribute_clear_range, (pmap_t pmap,
1096 vm_map_address_t start,
1097 vm_map_address_t end,
1098 unsigned int bits,
1099 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1100 #endif /* __ARM_RANGE_TLBI__ */
1101
1102
1103 PMAP_SUPPORT_PROTOTYPES(
1104 void,
1105 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1106
1107 PMAP_SUPPORT_PROTOTYPES(
1108 void,
1109 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1110
1111 PMAP_SUPPORT_PROTOTYPES(
1112 void,
1113 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1114
1115 PMAP_SUPPORT_PROTOTYPES(
1116 void,
1117 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1118
1119 PMAP_SUPPORT_PROTOTYPES(
1120 void,
1121 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1122
1123 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1124 PMAP_SUPPORT_PROTOTYPES(
1125 void,
1126 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1127 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1128
1129 /* Definition of the states used by pmap_trim(). */
1130 typedef enum {
1131 /* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
1132 PMAP_TRIM_STATE_START = 0,
1133
1134 /* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
1135 PMAP_TRIM_STATE_GRAND_BEFORE,
1136
1137 /* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
1138 PMAP_TRIM_STATE_GRAND_AFTER,
1139
1140 /* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
1141 PMAP_TRIM_STATE_SUBORD,
1142
1143 /* Marks that trimming is finished. */
1144 PMAP_TRIM_STATE_DONE,
1145
1146 /* Sentry enum for sanity checks. */
1147 PMAP_TRIM_STATE_COUNT,
1148 } pmap_trim_state_t;
1149
1150 PMAP_SUPPORT_PROTOTYPES(
1151 pmap_trim_state_t,
1152 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1153
1154 #if HAS_APPLE_PAC
1155 PMAP_SUPPORT_PROTOTYPES(
1156 void *,
1157 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1158 PMAP_SUPPORT_PROTOTYPES(
1159 void *,
1160 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1161 #endif /* HAS_APPLE_PAC */
1162
1163
1164
1165
1166 PMAP_SUPPORT_PROTOTYPES(
1167 kern_return_t,
1168 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1169 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1170
1171 PMAP_SUPPORT_PROTOTYPES(
1172 kern_return_t,
1173 pmap_load_trust_cache_with_type, (TCType_t type,
1174 const vm_address_t pmap_img4_payload,
1175 const vm_size_t pmap_img4_payload_len,
1176 const vm_address_t img4_manifest,
1177 const vm_size_t img4_manifest_len,
1178 const vm_address_t img4_aux_manifest,
1179 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1180
1181 PMAP_SUPPORT_PROTOTYPES(
1182 void,
1183 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1184
1185 PMAP_SUPPORT_PROTOTYPES(
1186 kern_return_t,
1187 pmap_query_trust_cache, (TCQueryType_t query_type,
1188 const uint8_t cdhash[kTCEntryHashSize],
1189 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1190
1191 #if PMAP_CS_INCLUDE_CODE_SIGNING
1192
1193 PMAP_SUPPORT_PROTOTYPES(
1194 kern_return_t,
1195 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1196 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1197
1198 PMAP_SUPPORT_PROTOTYPES(
1199 kern_return_t,
1200 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1201 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1202
1203 PMAP_SUPPORT_PROTOTYPES(
1204 kern_return_t,
1205 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1206 pmap_cs_profile_t * profile_obj),
1207 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1208
1209 PMAP_SUPPORT_PROTOTYPES(
1210 kern_return_t,
1211 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1212 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1213
1214 PMAP_SUPPORT_PROTOTYPES(
1215 kern_return_t,
1216 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1217 const void *kernel_entitlements),
1218 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1219
1220 PMAP_SUPPORT_PROTOTYPES(
1221 kern_return_t,
1222 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1223 const void **kernel_entitlements),
1224 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1225
1226 PMAP_SUPPORT_PROTOTYPES(
1227 kern_return_t,
1228 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1229 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1230
1231 PMAP_SUPPORT_PROTOTYPES(
1232 kern_return_t,
1233 pmap_cs_allow_invalid, (pmap_t pmap),
1234 PMAP_CS_ALLOW_INVALID_INDEX);
1235
1236 PMAP_SUPPORT_PROTOTYPES(
1237 void,
1238 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1239 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1240
1241 PMAP_SUPPORT_PROTOTYPES(
1242 bool,
1243 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1244 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1245
1246 PMAP_SUPPORT_PROTOTYPES(
1247 void,
1248 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1249 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1250
1251 PMAP_SUPPORT_PROTOTYPES(
1252 void,
1253 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1254 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1255
1256 #endif
1257
1258 PMAP_SUPPORT_PROTOTYPES(
1259 uint32_t,
1260 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1261
1262 PMAP_SUPPORT_PROTOTYPES(
1263 bool,
1264 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1265
1266 PMAP_SUPPORT_PROTOTYPES(
1267 void,
1268 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1269
1270 void pmap_footprint_suspend(vm_map_t map,
1271 boolean_t suspend);
1272 PMAP_SUPPORT_PROTOTYPES(
1273 void,
1274 pmap_footprint_suspend, (vm_map_t map,
1275 boolean_t suspend),
1276 PMAP_FOOTPRINT_SUSPEND_INDEX);
1277
1278
1279
1280
1281 #if DEVELOPMENT || DEBUG
1282 PMAP_SUPPORT_PROTOTYPES(
1283 kern_return_t,
1284 pmap_test_text_corruption, (pmap_paddr_t),
1285 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1286 #endif /* DEVELOPMENT || DEBUG */
1287
1288 /*
1289 * The low global vector page is mapped at a fixed alias.
1290 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1291 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1292 * to check both addresses anyway for backward compatibility. So for now
1293 * we leave H6 and H7 where they were.
1294 */
1295 #if (ARM_PGSHIFT == 14)
1296 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1297 #else
1298 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1299 #endif
1300
1301
/*
 * Lifetime counters of translation-table page allocations, kept in PPL-protected
 * data and 8-byte aligned for atomic 64-bit updates.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1305
#if XNU_MONITOR

/*
 * When pointer authentication is available, each table entry is signed as a
 * function pointer so a corrupted entry cannot be used for dispatch.
 */
#if __has_feature(ptrauth_calls)
#define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
#else
#define __ptrauth_ppl_handler
#endif

/*
 * Table of function pointers used for PPL dispatch.
 *
 * Indexed by the PMAP_*_INDEX selector values; each entry is the PPL-resident
 * "_internal" implementation of the corresponding pmap operation.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
#endif
1398
1399 #if XNU_MONITOR
1400 /**
1401 * A convenience function for setting protections on a single physical
1402 * aperture or static region mapping without invalidating the TLB.
1403 *
1404 * @note This function does not perform any TLB invalidations. That must be done
1405 * separately to be able to safely use the updated mapping.
1406 *
1407 * @note This function understands the difference between the VM page size and
1408 * the kernel page size and will update multiple PTEs if the sizes differ.
1409 * In other words, enough PTEs will always get updated to change the
1410 * permissions on a PAGE_SIZE amount of memory.
1411 *
1412 * @note The PVH lock for the physical page represented by this mapping must
1413 * already be locked.
1414 *
1415 * @note This function assumes the caller has already verified that the PTE
1416 * pointer does indeed point to a physical aperture or static region page
1417 * table. Please validate your inputs before passing it along to this
1418 * function.
1419 *
1420 * @param ptep Pointer to the physical aperture or static region page table to
1421 * update with a new XPRR index.
1422 * @param expected_perm The XPRR index that is expected to already exist at the
1423 * current mapping. If the current index doesn't match this
1424 * then the system will panic.
1425 * @param new_perm The new XPRR index to update the mapping with.
1426 */
MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(
	pt_entry_t * const ptep,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	assert(ptep != NULL);

	/*
	 * Snapshot the PTE once; all validation below and the final template are
	 * derived from this single read. The PVH lock (asserted here) prevents
	 * concurrent modification of the entry.
	 */
	pt_entry_t spte = *ptep;
	pvh_assert_locked(pa_index(pte_to_pa(spte)));

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, ptep, new_perm, expected_perm);
	}

	/**
	 * The PTE involved should be valid, should not have the hint bit set, and
	 * should have the expected XPRR index.
	 */
	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
		panic_plain("%s: physical aperture or static region PTE is invalid, "
		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	/* A set hint (contiguous) bit would make a single-PTE rewrite unsafe. */
	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
		panic_plain("%s: physical aperture or static region PTE has hint bit "
		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
		    "ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
	}

	/* Swap only the XPRR bits; every other attribute is carried over as-is. */
	pt_entry_t template = spte;
	template &= ~ARM_PTE_XPRR_MASK;
	template |= xprr_perm_to_pte(new_perm);

	write_pte_strong(ptep, template);
}
1471
1472 /**
1473 * Update the protections on a single physical aperture mapping and invalidate
1474 * the TLB so the mapping can be used.
1475 *
1476 * @note The PVH lock for the physical page must already be locked.
1477 *
1478 * @param pai The physical address index of the page whose physical aperture
1479 * mapping will be updated with new permissions.
1480 * @param expected_perm The XPRR index that is expected to already exist at the
1481 * current mapping. If the current index doesn't match this
1482 * then the system will panic.
1483 * @param new_perm The new XPRR index to update the mapping with.
1484 */
1485 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1486 pmap_set_xprr_perm(
1487 unsigned int pai,
1488 unsigned int expected_perm,
1489 unsigned int new_perm)
1490 {
1491 pvh_assert_locked(pai);
1492
1493 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1494 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1495
1496 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1497
1498 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1499 sync_tlb_flush();
1500 }
1501
1502 /**
1503 * Update the protections on a range of physical aperture or static region
1504 * mappings and invalidate the TLB so the mappings can be used.
1505 *
1506 * @note Static region mappings can only be updated before machine_lockdown().
1507 * Physical aperture mappings can be updated at any time.
1508 *
1509 * @param start The starting virtual address of the static region or physical
1510 * aperture range whose permissions will be updated.
1511 * @param end The final (inclusive) virtual address of the static region or
1512 * physical aperture range whose permissions will be updated.
1513 * @param expected_perm The XPRR index that is expected to already exist at the
1514 * current mappings. If the current indices don't match
1515 * this then the system will panic.
1516 * @param new_perm The new XPRR index to update the mappings with.
1517 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within the physmap or the static region. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 * (Rounds "va" up to the next twig boundary, clamped to "end".)
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Block mappings can't be updated PTE-by-PTE; require a table entry. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* pmap_set_pte_xprr_perm() requires the page's PVH lock to be held. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* One range invalidation for the whole walk, after all PTEs are rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1613
1614 #endif /* XNU_MONITOR */
1615
1616 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1617 PMAP_ZINFO_PALLOC(
1618 pmap_t pmap, int bytes)
1619 {
1620 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1621 }
1622
1623 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1624 PMAP_ZINFO_PFREE(
1625 pmap_t pmap,
1626 int bytes)
1627 {
1628 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1629 }
1630
1631 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1632 pmap_tt_ledger_credit(
1633 pmap_t pmap,
1634 vm_size_t size)
1635 {
1636 if (pmap != kernel_pmap) {
1637 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1638 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1639 }
1640 }
1641
1642 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1643 pmap_tt_ledger_debit(
1644 pmap_t pmap,
1645 vm_size_t size)
1646 {
1647 if (pmap != kernel_pmap) {
1648 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1649 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1650 }
1651 }
1652
/*
 * Mark a hardware ASID as recently used in the pseudo-LRU state.
 *
 * Atomically clears the ASID's bit in its 64-entry pLRU bitmap word. When the
 * whole word drains to zero, the word is bumped to a new generation and refilled
 * so every ASID in it becomes allocatable again (the top bit of the final word
 * is kept clear since that slot is beyond MAX_HW_ASIDS).
 *
 * No-op on targets with 16-bit ASIDs, which don't use the pLRU allocator.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		unsigned plru_index = asid_index >> 6;
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1666
/*
 * Allocate a virtual ASID (and derived hardware ASID) for "pmap".
 *
 * On non-16-bit-ASID targets a pseudo-LRU policy is tried first: pick the
 * least-recently-refilled pLRU word, then scan the free-ASID bitmap for a
 * free ASID whose hardware index falls in that word. On any miss (or on
 * 16-bit-ASID targets) allocation falls back to a plain bitmap scan.
 *
 * On success, fills in pmap->hw_asid / pmap->sw_asid and returns true.
 * Returns false only when every virtual ASID is in use.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Find the pLRU word with the oldest generation (least recently refilled). */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Scan the free-ASID bitmap, stepping by one full hardware-ASID chunk,
		 * for a free virtual ASID whose hardware slot lies in the chosen word.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* Every virtual ASID is in use. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID and a software "chunk" index. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1751
/*
 * Release a pmap's virtual and hardware ASID back to the global allocator.
 *
 * Atomically clears pmap->hw_asid first, so concurrent observers see the
 * pmap as having no ASID; a zero result means the ASID was already freed
 * (or never allocated) and there is nothing to do, which also makes this
 * function safe against double-free.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1; /* Undo the EL0/EL1 ASID-pair doubling applied at allocation. */
#endif
	hw_asid -= 1; /* Undo the +1 that reserves hardware ASID 0 for the kernel. */

#if HAS_16BIT_ASID
	/* 16-bit ASID targets have a 1:1 vasid:hw_asid mapping; no pLRU state. */
	vasid = hw_asid;
#else
	/*
	 * Reconstruct the virtual ASID from (sw_asid, hw_asid), mirroring the
	 * vasid -> (hw_asid, sw_asid) split done in the allocation path.
	 * sw_asid == UINT8_MAX marks the reserved-VASID case in which the
	 * hardware ASID was handed out from the high end of the range.
	 */
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Set the hardware ASID's bit so the pLRU allocator can hand it out again. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	/* The bit must currently be clear; a set bit would indicate a double free. */
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1784
1785
1786 boolean_t
pmap_valid_address(pmap_paddr_t addr)1787 pmap_valid_address(
1788 pmap_paddr_t addr)
1789 {
1790 return pa_valid(addr);
1791 }
1792
1793
1794
1795
1796
1797
1798 /*
1799 * Map memory at initialization. The physical addresses being
1800 * mapped are not managed and are never unmapped.
1801 *
1802 * For now, VM is already on, we only need to map the
1803 * specified memory.
1804 */
1805 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1806 pmap_map(
1807 vm_map_address_t virt,
1808 vm_offset_t start,
1809 vm_offset_t end,
1810 vm_prot_t prot,
1811 unsigned int flags)
1812 {
1813 kern_return_t kr;
1814 vm_size_t ps;
1815
1816 ps = PAGE_SIZE;
1817 while (start < end) {
1818 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1819 prot, VM_PROT_NONE, flags, FALSE);
1820
1821 if (kr != KERN_SUCCESS) {
1822 panic("%s: failed pmap_enter, "
1823 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1824 __FUNCTION__,
1825 (void *) virt, (void *) start, (void *) end, prot, flags);
1826 }
1827
1828 virt += ps;
1829 start += ps;
1830 }
1831 return virt;
1832 }
1833
/*
 * Back-door mapping of the physical range [start, end) at virtual address
 * `virt`, writing PTEs directly (no pmap_enter() bookkeeping).  The memory
 * attributes for the mapping are selected from `options`.
 *
 * Returns the virtual address one past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t tmplate;
	pt_entry_t *ptep;
	vm_map_address_t vaddr;
	vm_offset_t paddr;
	pt_entry_t mem_attr;

	/* Select cacheability/shareability attributes for the mapping. */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		/* Fall back to caching-disabled attributes. */
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* Kernel-only access: writable iff VM_PROT_WRITE; never executable (NX | PNX). */
	tmplate = pa_to_pte(start) | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA) |
	    mem_attr | ARM_PTE_TYPE | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF;
#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG; /* non-global: kernel mappings carry an ASID in this config */
#endif /* __ARM_KERNEL_PROTECT__ */

	vaddr = virt;
	paddr = start;
	while (paddr < end) {
		ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			panic("%s: no PTE for vaddr=%p, "
			    "virt=%p, start=%p, end=%p, prot=0x%x, options=0x%x",
			    __FUNCTION__, (void*)vaddr,
			    (void*)virt, (void*)start, (void*)end, prot, options);
		}

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, tmplate);

		/* Advance the physical address embedded in the PTE template. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1898
1899 #if XNU_MONITOR
1900 /**
1901 * Remove kernel writeablity from an IO PTE value if the page is owned by
1902 * guarded mode software.
1903 *
1904 * @param paddr The physical address of the page which has to be non-DRAM.
1905 * @param tmplate The PTE value to be evaluated.
1906 *
1907 * @return A new PTE value with permission bits modified.
1908 */
1909 static inline
1910 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1911 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1912 {
1913 assert(!pa_valid(paddr));
1914
1915 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1916
1917 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1918 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1919 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1920 switch (xprr_perm) {
1921 case XPRR_KERN_RO_PERM:
1922 break;
1923 case XPRR_KERN_RW_PERM:
1924 tmplate &= ~ARM_PTE_XPRR_MASK;
1925 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1926 break;
1927 default:
1928 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1929 }
1930 }
1931
1932 return tmplate;
1933 }
1934 #endif /* XNU_MONITOR */
1935
1936 /*
1937 * Back-door routine for mapping kernel VM at initialization.
1938 * Useful for mapping memory outside the range
1939 * [vm_first_phys, vm_last_phys] (i.e., devices).
1940 * Otherwise like pmap_map.
1941 */
1942 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1943 pmap_map_bd(
1944 vm_map_address_t virt,
1945 vm_offset_t start,
1946 vm_offset_t end,
1947 vm_prot_t prot)
1948 {
1949 pt_entry_t tmplate;
1950 pt_entry_t *ptep;
1951 vm_map_address_t vaddr;
1952 vm_offset_t paddr;
1953
1954 /* not cacheable and not buffered */
1955 tmplate = pa_to_pte(start)
1956 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1957 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1958 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1959 #if __ARM_KERNEL_PROTECT__
1960 tmplate |= ARM_PTE_NG;
1961 #endif /* __ARM_KERNEL_PROTECT__ */
1962
1963 vaddr = virt;
1964 paddr = start;
1965 while (paddr < end) {
1966 ptep = pmap_pte(kernel_pmap, vaddr);
1967 if (ptep == PT_ENTRY_NULL) {
1968 panic("pmap_map_bd");
1969 }
1970
1971 #if XNU_MONITOR
1972 if (!pa_valid(paddr)) {
1973 tmplate = pmap_construct_io_pte(paddr, tmplate);
1974 }
1975 #endif
1976
1977 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1978 write_pte_strong(ptep, tmplate);
1979
1980 pte_increment_pa(tmplate);
1981 vaddr += PAGE_SIZE;
1982 paddr += PAGE_SIZE;
1983 }
1984
1985 if (end >= start) {
1986 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1987 }
1988
1989 return vaddr;
1990 }
1991
1992 /*
1993 * Back-door routine for mapping kernel VM at initialization.
1994 * Useful for mapping memory specific physical addresses in early
1995 * boot (i.e., before kernel_map is initialized).
1996 *
1997 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1998 */
1999
2000 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)2001 pmap_map_high_window_bd(
2002 vm_offset_t pa_start,
2003 vm_size_t len,
2004 vm_prot_t prot)
2005 {
2006 pt_entry_t *ptep, pte;
2007 vm_map_address_t va_start = VREGION1_START;
2008 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
2009 vm_map_address_t va_end;
2010 vm_map_address_t va;
2011 vm_size_t offset;
2012
2013 offset = pa_start & PAGE_MASK;
2014 pa_start -= offset;
2015 len += offset;
2016
2017 if (len > (va_max - va_start)) {
2018 panic("%s: area too large, "
2019 "pa_start=%p, len=%p, prot=0x%x",
2020 __FUNCTION__,
2021 (void*)pa_start, (void*)len, prot);
2022 }
2023
2024 scan:
2025 for (; va_start < va_max; va_start += PAGE_SIZE) {
2026 ptep = pmap_pte(kernel_pmap, va_start);
2027 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2028 if (*ptep == ARM_PTE_TYPE_FAULT) {
2029 break;
2030 }
2031 }
2032 if (va_start > va_max) {
2033 panic("%s: insufficient pages, "
2034 "pa_start=%p, len=%p, prot=0x%x",
2035 __FUNCTION__,
2036 (void*)pa_start, (void*)len, prot);
2037 }
2038
2039 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2040 ptep = pmap_pte(kernel_pmap, va_end);
2041 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2042 if (*ptep != ARM_PTE_TYPE_FAULT) {
2043 va_start = va_end + PAGE_SIZE;
2044 goto scan;
2045 }
2046 }
2047
2048 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2049 ptep = pmap_pte(kernel_pmap, va);
2050 pte = pa_to_pte(pa_start)
2051 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2052 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2053 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2054 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2055 #if __ARM_KERNEL_PROTECT__
2056 pte |= ARM_PTE_NG;
2057 #endif /* __ARM_KERNEL_PROTECT__ */
2058 write_pte_strong(ptep, pte);
2059 }
2060 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2061 #if KASAN
2062 kasan_notify_address(va_start, len);
2063 #endif
2064 return va_start;
2065 }
2066
2067 static uint32_t
pmap_compute_max_asids(void)2068 pmap_compute_max_asids(void)
2069 {
2070 DTEntry entry;
2071 void const *prop = NULL;
2072 uint32_t max_asids;
2073 int err;
2074 unsigned int prop_size;
2075
2076 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2077 assert(err == kSuccess);
2078
2079 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2080 /* TODO: consider allowing maxproc limits to be scaled earlier so that
2081 * we can choose a more flexible default value here. */
2082 return MAX_ASIDS;
2083 }
2084
2085 if (prop_size != sizeof(max_asids)) {
2086 panic("pmap-max-asids property is not a 32-bit integer");
2087 }
2088
2089 max_asids = *((uint32_t const *)prop);
2090 #if HAS_16BIT_ASID
2091 if (max_asids > MAX_HW_ASIDS) {
2092 panic("pmap-max-asids 0x%x too large", max_asids);
2093 }
2094 #else
2095 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2096 max_asids = (max_asids + 63) & ~63UL;
2097
2098 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2099 /* currently capped by size of pmap->sw_asid */
2100 panic("pmap-max-asids 0x%x too large", max_asids);
2101 }
2102 #endif /* HAS_16BIT_ASID */
2103 if (max_asids == 0) {
2104 panic("pmap-max-asids cannot be zero");
2105 }
2106 return max_asids;
2107 }
2108
2109 #if __arm64__
2110 /*
2111 * pmap_get_arm64_prot
2112 *
2113 * return effective armv8 VMSA block protections including
2114 * table AP/PXN/XN overrides of a pmap entry
2115 *
2116 */
2117
2118 uint64_t
pmap_get_arm64_prot(pmap_t pmap,vm_offset_t addr)2119 pmap_get_arm64_prot(
2120 pmap_t pmap,
2121 vm_offset_t addr)
2122 {
2123 tt_entry_t tte = 0;
2124 unsigned int level = 0;
2125 uint64_t tte_type = 0;
2126 uint64_t effective_prot_bits = 0;
2127 uint64_t aggregate_tte = 0;
2128 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
2129 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2130
2131 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
2132 tte = *pmap_ttne(pmap, level, addr);
2133
2134 if (!(tte & ARM_TTE_VALID)) {
2135 return 0;
2136 }
2137
2138 tte_type = tte & ARM_TTE_TYPE_MASK;
2139
2140 if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
2141 (level == pt_attr->pta_max_level)) {
2142 /* Block or page mapping; both have the same protection bit layout. */
2143 break;
2144 } else if (tte_type == ARM_TTE_TYPE_TABLE) {
2145 /* All of the table bits we care about are overrides, so just OR them together. */
2146 aggregate_tte |= tte;
2147 }
2148 }
2149
2150 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
2151 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
2152 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
2153
2154 /* Start with the PTE bits. */
2155 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
2156
2157 /* Table AP bits mask out block/page AP bits */
2158 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
2159
2160 /* XN/PXN bits can be OR'd in. */
2161 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
2162 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
2163
2164 return effective_prot_bits;
2165 }
2166 #endif /* __arm64__ */
2167
2168 /*
2169 * Bootstrap the system enough to run with virtual memory.
2170 *
2171 * The early VM initialization code has already allocated
2172 * the first CPU's translation table and made entries for
2173 * all the one-to-one mappings to be found there.
2174 *
2175 * We must set up the kernel pmap structures, the
2176 * physical-to-virtual translation lookup tables for the
2177 * physical memory to be managed (between avail_start and
2178 * avail_end).
2179 *
2180 * Map the kernel's code and data, and allocate the system page table.
2181 * Page_size must already be set.
2182 *
2183 * Parameters:
2184 * first_avail first available physical page -
2185 * after kernel page tables
2186 * avail_start PA of first managed physical page
2187 * avail_end PA of last managed physical page
2188 */
2189
2190 void
pmap_bootstrap(vm_offset_t vstart)2191 pmap_bootstrap(
2192 vm_offset_t vstart)
2193 {
2194 vm_map_offset_t maxoffset;
2195
2196 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2197
2198 #if XNU_MONITOR
2199
2200 #if DEVELOPMENT || DEBUG
2201 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2202 #endif
2203
2204 #if CONFIG_CSR_FROM_DT
2205 if (csr_unsafe_kernel_text) {
2206 pmap_ppl_disable = true;
2207 }
2208 #endif /* CONFIG_CSR_FROM_DT */
2209
2210 #endif /* XNU_MONITOR */
2211
2212 #if DEVELOPMENT || DEBUG
2213 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2214 kprintf("Kernel traces for pmap operations enabled\n");
2215 }
2216 #endif
2217
2218 /*
2219 * Initialize the kernel pmap.
2220 */
2221 #if ARM_PARAMETERIZED_PMAP
2222 kernel_pmap->pmap_pt_attr = native_pt_attr;
2223 #endif /* ARM_PARAMETERIZED_PMAP */
2224 #if HAS_APPLE_PAC
2225 kernel_pmap->disable_jop = 0;
2226 #endif /* HAS_APPLE_PAC */
2227 kernel_pmap->tte = cpu_tte;
2228 kernel_pmap->ttep = cpu_ttep;
2229 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2230 kernel_pmap->max = UINTPTR_MAX;
2231 os_atomic_init(&kernel_pmap->ref_count, 1);
2232 #if XNU_MONITOR
2233 os_atomic_init(&kernel_pmap->nested_count, 0);
2234 #endif
2235 kernel_pmap->nx_enabled = TRUE;
2236 #ifdef __arm64__
2237 kernel_pmap->is_64bit = TRUE;
2238 #else
2239 kernel_pmap->is_64bit = FALSE;
2240 #endif
2241 #if CONFIG_ROSETTA
2242 kernel_pmap->is_rosetta = FALSE;
2243 #endif
2244
2245 #if ARM_PARAMETERIZED_PMAP
2246 kernel_pmap->pmap_pt_attr = native_pt_attr;
2247 #endif /* ARM_PARAMETERIZED_PMAP */
2248
2249 kernel_pmap->nested_region_addr = 0x0ULL;
2250 kernel_pmap->nested_region_size = 0x0ULL;
2251 kernel_pmap->nested_region_asid_bitmap = NULL;
2252 kernel_pmap->nested_region_asid_bitmap_size = 0x0UL;
2253 kernel_pmap->type = PMAP_TYPE_KERNEL;
2254
2255 kernel_pmap->hw_asid = 0;
2256 kernel_pmap->sw_asid = 0;
2257
2258 pmap_lock_init(kernel_pmap);
2259
2260 pmap_max_asids = pmap_compute_max_asids();
2261 #if HAS_16BIT_ASID
2262 asid_chunk_size = MAX_HW_ASIDS;
2263 #else
2264 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2265 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2266 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2267 * masking used by the PLRU scheme. This means we must handle the case in which
2268 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2269 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2270 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2271 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2272 #endif /* HAS_16BIT_ASIDS */
2273
2274 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2275
2276 /**
2277 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2278 * pp_attr_table, etc). This function will use `avail_start` to allocate
2279 * space for these data structures.
2280 */
2281 pmap_data_bootstrap();
2282
2283 /**
2284 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2285 */
2286 uat_bootstrap();
2287
2288
2289 /**
2290 * Bootstrap any necessary SART data structures and values needed from the device tree.
2291 */
2292 sart_bootstrap();
2293
2294 /**
2295 * Don't make any assumptions about the alignment of avail_start before this
2296 * point (i.e., pmap_data_bootstrap() performs allocations).
2297 */
2298 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2299
2300 const pmap_paddr_t pmap_struct_start = avail_start;
2301
2302 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2303 avail_start = round_page(avail_start + asid_table_size);
2304
2305 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2306
2307 vm_first_phys = gPhysBase;
2308 vm_last_phys = trunc_page(avail_end);
2309
2310 queue_init(&map_pmap_list);
2311 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2312 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2313 free_page_size_tt_count = 0;
2314 free_page_size_tt_max = 0;
2315 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2316 free_two_page_size_tt_count = 0;
2317 free_two_page_size_tt_max = 0;
2318 free_tt_list = TT_FREE_ENTRY_NULL;
2319 free_tt_count = 0;
2320 free_tt_max = 0;
2321
2322 virtual_space_start = vstart;
2323 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2324
2325 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2326 #if !HAS_16BIT_ASID
2327 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2328 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2329 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2330 #endif /* !HAS_16BIT_ASID */
2331
2332
2333
2334 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2335 maxoffset = trunc_page(maxoffset);
2336 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2337 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2338 arm_pmap_max_offset_default = maxoffset;
2339 }
2340 }
2341 #if defined(__arm64__)
2342 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2343 maxoffset = trunc_page(maxoffset);
2344 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2345 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2346 arm64_pmap_max_offset_default = maxoffset;
2347 }
2348 }
2349 #endif
2350
2351 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2352
2353
2354 #if PMAP_CS_PPL_MONITOR
2355 /* Initialize the PPL trust cache read-write lock */
2356 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2357 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2358 #endif
2359
2360 #if MACH_ASSERT
2361 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2362 &vm_footprint_suspend_allowed,
2363 sizeof(vm_footprint_suspend_allowed));
2364 #endif /* MACH_ASSERT */
2365
2366 #if KASAN
2367 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2368 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2369 #endif /* KASAN */
2370
2371 /**
2372 * Ensure that avail_start is always left on a page boundary. The calling
2373 * code might not perform any alignment before allocating page tables so
2374 * this is important.
2375 */
2376 avail_start = round_page(avail_start);
2377 }
2378
2379 #if XNU_MONITOR
2380
2381 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2382 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2383 {
2384 pmap_paddr_t cur_pa;
2385 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2386 assert(pa_valid(cur_pa));
2387 ppattr_pa_set_monitor(cur_pa);
2388 }
2389 }
2390
2391 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2392 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2393 pmap_paddr_t end_pa,
2394 unsigned int expected_perm,
2395 unsigned int new_perm)
2396 {
2397 vm_offset_t start_va = phystokv(start_pa);
2398 vm_offset_t end_va = start_va + (end_pa - start_pa);
2399
2400 pa_set_range_monitor(start_pa, end_pa);
2401 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2402 }
2403
/*
 * Apply the PVH_FLAG_LOCKDOWN_KC flag to every physical page backing the
 * kernelcache, preventing those pages from being remapped.  Pages whose
 * physical-to-virtual translation is non-linear w.r.t. the kernelcache are
 * skipped, as they will be reclaimed by the OS.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() may shrink range_size to the contiguous subrange. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		/* A page already locked down indicates a bookkeeping error. */
		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must remain remappable; undo their lockdown. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2449
/*
 * Transfer ownership/protection of the static boot-time allocations (page
 * tables, PPL text/data, PPL stacks) to their steady-state xPRR
 * permissions, and lock down the kernelcache.  Called once after all
 * static allocations have been performed.
 */
void
pmap_static_allocations_done(void)
{
	pmap_paddr_t monitor_start_pa;
	pmap_paddr_t monitor_end_pa;

	/*
	 * Protect the bootstrap (V=P and V->P) page tables.
	 *
	 * These bootstrap allocations will be used primarily for page tables.
	 * If we wish to secure the page tables, we need to start by marking
	 * these bootstrap allocations as pages that we want to protect.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
	monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;

	/* The bootstrap page tables are mapped RW at boostrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);

	/*
	 * We use avail_start as a pointer to the first address that has not
	 * been reserved for bootstrap, so we know which pages to give to the
	 * virtual memory layer.
	 */
	monitor_start_pa = first_avail_phys;
	monitor_end_pa = avail_start;

	/* The other bootstrap allocations are mapped RW at bootstrap. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	/*
	 * The RO page tables are mapped RW in arm_vm_init() and later restricted
	 * to RO in arm_vm_prot_finalize(), which is called after this function.
	 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
	 * they can't be allocated for other uses. We don't need a special xPRR
	 * protection index, as there is no PPL_RO index, and these pages are ultimately
	 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
	 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
	 * lookup table index to USER_XO before APRR is applied, leading the hardware
	 * to believe we are dealing with an user XO page upon performing a translation.
	 */
	monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
	monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
	pa_set_range_monitor(monitor_start_pa, monitor_end_pa);

	monitor_start_pa = kvtophys_nofail(segPPLDATAB);
	monitor_end_pa = monitor_start_pa + segSizePPLDATA;

	/* PPL data is RW for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);

	monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
	monitor_end_pa = monitor_start_pa + segSizePPLTEXT;

	/* PPL text is RX for the PPL, RO for the kernel. */
	pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);


	/*
	 * In order to support DTrace, the save areas for the PPL must be
	 * writable. This is due to the fact that DTrace will try to update
	 * register state.
	 */
	if (pmap_ppl_disable) {
		vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
		vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);

		pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
	}


	if (segSizePPLDATACONST > 0) {
		monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
		monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;

		/* Permissions unchanged (RO -> RO); this marks the pages PPL-owned. */
		pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
	}

	/*
	 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
	 * precaution. The real RW mappings are at a different location with guard pages.
	 */
	pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);

	/* Prevent remapping of the kernelcache */
	pmap_lockdown_kc();
}
2537
/*
 * Finalize PPL lockdown: lock down the commpage mappings so they can no
 * longer be remapped.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): the #error below appears unconditional as seen here, which
	 * would fail any build of this configuration — presumably it sits under an
	 * #if on the XPRR/APRR configuration whose context is not visible in this
	 * chunk; confirm against the surrounding preprocessor conditionals. */
#error "XPRR configuration error"
}
2554 #endif /* XNU_MONITOR */
2555
/*
 * Report the [start, end) range of kernel virtual address space available
 * for general use after bootstrap, as recorded by pmap_bootstrap().
 *
 * @param startp filled with the first available kernel virtual address.
 * @param endp filled with the end of the available kernel virtual range.
 */
void
pmap_virtual_space(
	vm_offset_t *startp,
	vm_offset_t *endp
	)
{
	*startp = virtual_space_start;
	*endp = virtual_space_end;
}
2565
2566
/*
 * Enumerate the kernel virtual regions reserved by the pmap layer.
 * For each supported index `region_select`, fills *startp/*size with the
 * region's bounds and returns TRUE; returns FALSE once region_select
 * exceeds the number of regions for this configuration.  The set of
 * regions depends on KTRR/CTRR and ARM_LARGE_MEMORY configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		if (!TEST_PAGE_SIZE_4K) {
			/* 16K pages: 32MB (L2 block) alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			/* 4K pages: 8MB alignment. */
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region when it does not overlap region 0. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2660
2661 /*
2662 * Routines to track and allocate physical pages during early boot.
2663 * On most systems that memory runs from first_avail through to avail_end
2664 * with no gaps.
2665 *
2666 * If the system supports ECC and ecc_bad_pages_count > 0, we
2667 * need to skip those pages.
2668 */
2669
2670 static unsigned int avail_page_count = 0;
2671 static bool need_ram_ranges_init = true;
2672
2673
2674 /**
2675 * Checks to see if a given page is in
2676 * the array of known bad pages
2677 *
2678 * @param ppn page number to check
2679 */
2680 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2681 pmap_is_bad_ram(__unused ppnum_t ppn)
2682 {
2683 return false;
2684 }
2685
2686 /**
2687 * Prepare bad ram pages to be skipped.
2688 */
2689
2690 /*
2691 * Initialize the count of available pages. No lock needed here,
2692 * as this code is called while kernel boot up is single threaded.
2693 */
2694 static void
initialize_ram_ranges(void)2695 initialize_ram_ranges(void)
2696 {
2697 pmap_paddr_t first = first_avail;
2698 pmap_paddr_t end = avail_end;
2699
2700 assert(first <= end);
2701 assert(first == (first & ~PAGE_MASK));
2702 assert(end == (end & ~PAGE_MASK));
2703 avail_page_count = atop(end - first);
2704
2705 need_ram_ranges_init = false;
2706 }
2707
2708 unsigned int
pmap_free_pages(void)2709 pmap_free_pages(
2710 void)
2711 {
2712 if (need_ram_ranges_init) {
2713 initialize_ram_ranges();
2714 }
2715 return avail_page_count;
2716 }
2717
2718 unsigned int
pmap_free_pages_span(void)2719 pmap_free_pages_span(
2720 void)
2721 {
2722 if (need_ram_ranges_init) {
2723 initialize_ram_ranges();
2724 }
2725 return (unsigned int)atop(avail_end - first_avail);
2726 }
2727
2728
boolean_t
pmap_next_page_hi(
	ppnum_t * pnum,
	__unused boolean_t might_free)
{
	/* No separate high-memory policy here: defer to the generic allocator. */
	return pmap_next_page(pnum);
}
2736
2737
2738 boolean_t
pmap_next_page(ppnum_t * pnum)2739 pmap_next_page(
2740 ppnum_t *pnum)
2741 {
2742 if (need_ram_ranges_init) {
2743 initialize_ram_ranges();
2744 }
2745
2746
2747 if (first_avail != avail_end) {
2748 *pnum = (ppnum_t)atop(first_avail);
2749 first_avail += PAGE_SIZE;
2750 assert(avail_page_count > 0);
2751 --avail_page_count;
2752 return TRUE;
2753 }
2754 assert(avail_page_count == 0);
2755 return FALSE;
2756 }
2757
2758
2759 /*
2760 * Initialize the pmap module.
2761 * Called by vm_init, to initialize any structures that the pmap
2762 * system needs to map virtual memory.
2763 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent translation
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2804
2805 /**
2806 * Verify that a given physical page contains no mappings (outside of the
2807 * default physical aperture mapping).
2808 *
2809 * @param ppnum Physical page number to check there are no mappings to.
2810 *
2811 * @return True if there are no mappings, false otherwise or if the page is not
2812 * kernel-managed.
2813 */
2814 bool
pmap_verify_free(ppnum_t ppnum)2815 pmap_verify_free(ppnum_t ppnum)
2816 {
2817 const pmap_paddr_t pa = ptoa(ppnum);
2818
2819 assert(pa != vm_page_fictitious_addr);
2820
2821 /* Only mappings to kernel-managed physical memory are tracked. */
2822 if (!pa_valid(pa)) {
2823 return false;
2824 }
2825
2826 const unsigned int pai = pa_index(pa);
2827 pv_entry_t **pvh = pai_to_pvh(pai);
2828
2829 return pvh_test_type(pvh, PVH_TYPE_NULL);
2830 }
2831
2832 #if MACH_ASSERT
2833 /**
2834 * Verify that a given physical page contains no mappings (outside of the
2835 * default physical aperture mapping) and if it does, then panic.
2836 *
2837 * @note It's recommended to use pmap_verify_free() directly when operating in
2838 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2839 * normally being called from outside of the PPL, and the pv_head_table
2840 * can't be modified outside of the PPL).
2841 *
2842 * @param ppnum Physical page number to check there are no mappings to.
2843 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* A mapping exists; locate it to enrich the panic message below. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* Single-mapping case: the PV head points directly at the PTE. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2916 #endif
2917
2918
2919 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2920 pmap_root_alloc_size(pmap_t pmap)
2921 {
2922 #pragma unused(pmap)
2923 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2924 unsigned int root_level = pt_attr_root_level(pt_attr);
2925 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2926 }
2927
2928
2929 /*
2930 * Create and return a physical map.
2931 *
2932 * If the size specified for the map
2933 * is zero, the map is an actual physical
2934 * map, and may be referenced by the
2935 * hardware.
2936 *
2937 * If the size specified is non-zero,
2938 * the map will be used in software only, and
2939 * is bounded by that size.
2940 */
/**
 * Internal (PPL-resident when XNU_MONITOR) implementation of pmap creation.
 *
 * @param ledger ledger to attribute this pmap's memory accounting to.
 * @param size must be 0 here; non-zero sizes are rejected.
 * @param flags PMAP_CREATE_* flags; anything outside PMAP_CREATE_KNOWN_FLAGS is rejected.
 * @param kr out-parameter receiving the failure code on the error paths.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject any flag outside the supported set. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone. Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve an address-space ID (ASID) for this pmap. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_asid_bitmap = NULL;
	p->nested_region_asid_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the fully-initialized pmap on the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* The caller's kr pointer lives outside the PPL; pin it while writing through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3131
/**
 * Create a new physical map (see the block comment above
 * pmap_create_options_internal() for the size semantics).
 *
 * @param ledger ledger the new pmap's memory use is charged to; referenced
 *        here and dereferenced again if creation fails.
 * @param size must be 0 (non-zero is rejected by the internal routine).
 * @param flags PMAP_CREATE_* flags.
 *
 * @return the new pmap, or PMAP_NULL on failure.
 */
pmap_t
pmap_create_options(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags)
{
	pmap_t pmap;
	kern_return_t kr = KERN_SUCCESS;

	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);

	ledger_reference(ledger);

#if XNU_MONITOR
	/*
	 * Retry while the PPL reports a resource shortage, donating a page to
	 * the PPL between attempts.
	 */
	for (;;) {
		pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
		if (kr != KERN_RESOURCE_SHORTAGE) {
			break;
		}
		assert(pmap == PMAP_NULL);
		pmap_alloc_page_for_ppl(0);
		kr = KERN_SUCCESS;
	}
#else
	pmap = pmap_create_options_internal(ledger, size, flags, &kr);
#endif

	if (pmap == PMAP_NULL) {
		ledger_dereference(ledger);
	}

	/* NOTE(review): on failure pmap is PMAP_NULL here, and the trace arguments
	 * dereference it; presumably PMAP_TRACE only evaluates its arguments when
	 * tracing is enabled — confirm before relying on this path. */
	PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	return pmap;
}
3167
3168 #if XNU_MONITOR
3169 /*
3170 * This symbol remains in place when the PPL is enabled so that the dispatch
3171 * table does not change from development to release configurations.
3172 */
3173 #endif
3174 #if MACH_ASSERT || XNU_MONITOR
/* Record the owning process (pid + name) on the pmap for debug diagnostics. */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	/* A pmap_pid of -1 presumably marks a pmap whose identity must not be
	 * overwritten — confirm against the setter of that sentinel. */
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3192 #endif /* MACH_ASSERT || XNU_MONITOR */
3193
3194 #if MACH_ASSERT
/* Public wrapper: route through the PPL when the monitor is enabled. */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3207 #endif /* MACH_ASSERT */
3208
3209 /*
3210 * pmap_deallocate_all_leaf_tts:
3211 *
3212 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3213 * removing and deallocating all TTEs.
3214 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* This walker only handles table (non-leaf) levels. */
	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of the last entry of a table at this level (index of the all-ones VA). */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		/* Skip entries that map nothing. */
		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are unexpected in a user pmap being torn down. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/* Remove the TTE. */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		/* NOTE(review): no matching pmap_unlock() in this function — presumably
		 * pmap_tte_deallocate() drops the exclusive lock; confirm. */
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3253
3254 /*
3255 * We maintain stats and ledgers so that a task's physical footprint is:
3256 * phys_footprint = ((internal - alternate_accounting)
3257 * + (internal_compressed - alternate_accounting_compressed)
3258 * + iokit_mapped
3259 * + purgeable_nonvolatile
3260 * + purgeable_nonvolatile_compressed
3261 * + page_table)
3262 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3263 */
3264
3265 /*
3266 * Retire the given physical map from service.
3267 * Should only be called if the map contains
3268 * no valid mappings.
3269 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Drop our reference; only the final reference performs the teardown below. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	/* Make sure no CPU still has this pmap active or in-flight. */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unlink from the global pmap list. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmap: flush only its nested region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_asid_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_asid_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_asid_bitmap,
		    pmap->nested_region_asid_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3382
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger first: the pmap struct may be freed by the call below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Drop the reference taken in pmap_create_options(). */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3403
3404
3405 /*
3406 * Add a reference to the specified pmap.
3407 */
3408 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3409 pmap_reference_internal(
3410 pmap_t pmap)
3411 {
3412 if (pmap != PMAP_NULL) {
3413 validate_pmap_mutable(pmap);
3414 os_atomic_inc(&pmap->ref_count, relaxed);
3415 }
3416 }
3417
/* Public wrapper: route through the PPL when the monitor is enabled. */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3428
/**
 * Allocate a root (TT1) translation table of `size` bytes, preferring the
 * cached free lists and falling back to a fresh (zeroed) page allocation.
 * Sub-page allocations carve one page into PMAP_ROOT_ALLOC_SIZE pieces and
 * stash the leftovers on free_tt_list.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	/* Sub-page requests other than the root-table size are rounded up to a page. */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* Fast path: pop a cached table off the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Slow path: allocate fresh zeroed pages (at least one page). */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/* Carve the page: first chunk is returned, the rest feed free_tt_list. */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		/* Splice the locally-built chain onto the global free list. */
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3505
/**
 * Return a root (TT1) translation table to the cached free lists, then trim
 * each list back to its limit (unless the caller cannot block).
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	/* Mirror the size rounding applied by pmap_tt1_allocate(). */
	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller cannot block: skip the trimming loops below. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/* Trim the one-page free list back under its cap, dropping the lock
	 * around each page free. */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise trim the two-page free list. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3586
/**
 * Allocate a (non-root) translation table for the given pmap, first from the
 * pmap's per-pmap free list, then from a fresh page.
 *
 * @param pmap the pmap the table belongs to.
 * @param ttp out-parameter receiving the KVA of the new table.
 * @param level translation-table level being allocated (affects stats only).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT behavior flags.
 *
 * @return KERN_SUCCESS, KERN_ABORTED if the pmap lock could not be taken
 *         preemptibly, or KERN_RESOURCE_SHORTAGE in NOWAIT mode.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		/*
		 * Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			/* NOTE(review): the early-return test uses PMAP_OPTIONS_NOWAIT
			 * while the allocation flag above uses PMAP_TT_ALLOCATE_NOWAIT —
			 * confirm the mix of flag namespaces is intentional. */
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Update the global page-table page statistics. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Mark the physical page as holding a page-table descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			/* Chain the unused sub-page pieces of this kernel page onto the
			 * pmap's free list for future pmap_tt_allocate() calls. */
			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3685
3686
/**
 * Release a page table previously obtained from pmap_tt_allocate().
 *
 * The table is threaded onto the pmap's free list of sub-page tables
 * (pmap->tt_entry_free).  If, after accounting for this table, no table
 * within the containing VM page is referenced, the whole VM page is
 * unlinked from the free list and returned to the VM, with the ledger
 * and in-use counters adjusted accordingly.
 *
 * @param pmap The pmap that owns the page table being freed.
 * @param ttp Kernel-virtual pointer to the table being freed.
 * @param level The page table level of the table being freed.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-page-sized tables that fit in one VM page (may be > 1, e.g. 4K tables on a 16K VM page). */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Mark this table's VA slot in the descriptor as unused. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables carry the sentinel refcount; reset it so the zero check below can pass. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every sub-page table sharing this VM page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		/*
		 * No table in this VM page is in use.  Walk the free list and count how
		 * many of this page's other sub-page tables are already free-listed
		 * (this table itself counts as the first).
		 */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			tt_free_entry_t *tt_free_list_cur;

			/*
			 * Every sibling table is free: splice all entries belonging to this
			 * VM page out of the free list, and release the page after the pmap
			 * lock is dropped below.
			 */
			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry lives in the page being freed: unlink it. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Not all siblings are free yet; just push this table onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Some sibling table is still referenced; push this table onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/* Entire page table page is empty: return it to the VM and update accounting. */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3775
/**
 * Safely clear out a translation table entry.
 *
 * @note If the TTE to clear out points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 * @note This function expects to be called with pmap locked exclusive, and will
 *       return with pmap unlocked.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the TTE that should be cleared out.
 * @param level The level of the page table that contains the TTE to be removed.
 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the entry and synchronize the update before any TLB maintenance below. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU that has not yet been observed outside pmap_disconnect(). */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/*
				 * Load-exclusive arms the monitor for the WFE below, so this CPU
				 * can sleep until the remote flag is updated rather than spin.
				 */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 *   1. A bug in the refcnt management logic.
		 *   2. A memory stomper or hardware failure.
		 *   3. The VM forgetting to unmap all of the valid mappings in an address
		 *      space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3965
3966 /**
3967 * Given a pointer to an entry within a `level` page table, delete the
3968 * page table at `level` + 1 that is represented by that entry. For instance,
3969 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3970 * contains the PA of the L3 table, and `level` would be "2".
3971 *
3972 * @note If the table getting deallocated is a leaf table, then that leaf table
3973 * must have a refcnt of zero before getting deallocated. All other levels
3974 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3975 * @note This function expects to be called with pmap locked exclusive and will
3976 * return with pmap unlocked.
3977 *
3978 * @param pmap The pmap that owns the page table to be deallocated.
3979 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3980 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3981 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3982 * @param ttep Pointer to the `level` TTE to remove.
3983 * @param level The level of the table that contains an entry pointing to the
3984 * table to be removed. The deallocated page table will be a
3985 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3986 * deleted).
3987 */
3988 void
3989 pmap_tte_deallocate(
3990 pmap_t pmap,
3991 vm_offset_t va_start,
3992 vm_offset_t va_end,
3993 bool need_strong_sync,
3994 tt_entry_t *ttep,
3995 unsigned int level)
3996 {
3997 tt_entry_t tte;
3998
3999 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4000
4001 tte = *ttep;
4002
4003 if (tte_get_ptd(tte)->pmap != pmap) {
4004 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
4005 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
4006 }
4007
4008 assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
4009 __func__, ttep, (unsigned long long)tte);
4010
4011 /* pmap_tte_remove() will drop the pmap lock */
4012 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
4013
4014 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
4015 }
4016
4017 /*
4018 * Remove a range of hardware page-table entries.
4019 * The entries given are the first (inclusive)
4020 * and last (exclusive) entries for the VM pages.
4021 * The virtual address is the va for the first pte.
4022 *
4023 * The pmap must be locked.
4024 * If the pmap is not the kernel pmap, the range must lie
4025 * entirely within one pte-page. This is NOT checked.
4026 * Assumes that the pte-page exists.
4027 *
4028 * Returns the number of PTE changed
4029 */
4030 MARK_AS_PMAP_TEXT static int
4031 pmap_remove_range(
4032 pmap_t pmap,
4033 vm_map_address_t va,
4034 pt_entry_t *bpte,
4035 pt_entry_t *epte)
4036 {
4037 bool need_strong_sync = false;
4038 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4039 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4040 if (num_changed > 0) {
4041 PMAP_UPDATE_TLBS(pmap, va,
4042 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4043 }
4044 return num_changed;
4045 }
4046
4047
4048 #ifdef PVH_FLAG_EXEC
4049
4050 /*
4051 * Update the access protection bits of the physical aperture mapping for a page.
4052 * This is useful, for example, in guranteeing that a verified executable page
4053 * has no writable mappings anywhere in the system, including the physical
4054 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4055 * synchronization overhead in cases where the call to this function is
4056 * guaranteed to be followed by other TLB operations.
4057 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the kernel's physical-aperture PTE for this physical page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/*
	 * NOTE(review): hint bit presumably marks a contiguous-span mapping whose
	 * permissions can't safely be changed one page at a time — confirm.
	 */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Queue the TLB invalidation; only wait for completion if the caller asked us to. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4082 #endif /* defined(PVH_FLAG_EXEC) */
4083
4084
4085
/**
 * Remove a range of hardware PTEs [bpte, epte) starting at VA `va`.
 *
 * Must be called with the pmap locked exclusive.  PTEs are cleared but the
 * TLB is NOT flushed here; callers are responsible for TLB maintenance.
 *
 * @param pmap The pmap whose mappings are being removed.
 * @param va VA corresponding to the first PTE in the range.
 * @param bpte First (inclusive) PTE to remove; range must not cross a page table page.
 * @param epte Last (exclusive) PTE to remove.
 * @param eva If non-NULL, enables early exit on pending preemption; on return
 *            holds the VA at which processing stopped.
 * @param need_strong_sync Set to true if a stronger TLB synchronization is required.
 * @param options PMAP_OPTIONS_REMOVE enables handling of compressed markers.
 *
 * @return The number of PTEs actually modified.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	/* Net adjustment to apply to the page table's refcount at the end. */
	int16_t refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The entire range must live within a single page table page. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/* Loop until we've either determined the page is unmanaged or taken its PVH lock. */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE; it may have changed before we acquired the PVH lock. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for the ledger adjustments below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta; a drop to or below zero means an over-release. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4327
4328
4329 /*
4330 * Remove the given range of addresses
4331 * from the specified map.
4332 *
4333 * It is assumed that the start and end are properly
4334 * rounded to the hardware page size.
4335 */
4336 void
4337 pmap_remove(
4338 pmap_t pmap,
4339 vm_map_address_t start,
4340 vm_map_address_t end)
4341 {
4342 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4343 }
4344
/**
 * Remove the mappings in [start, end) from `pmap`, possibly stopping early if
 * preemption is pending.
 *
 * @param pmap The pmap to remove mappings from.
 * @param start Start of the VA range to remove; the range is expected to lie
 *              within a single twig-level (leaf table) span.
 * @param end Non-inclusive end of the VA range to remove.
 * @param options Removal options passed through to pmap_remove_range_options().
 *
 * @return The VA at which removal actually stopped (== end when the whole
 *         range was processed; earlier if preemption cut the work short).
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	/* Tracks whether we still hold the pmap lock; pmap_tte_deallocate() drops it for us. */
	bool unlock = true;

	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table is now empty in a user pmap, free it outright. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4411
/**
 * Remove the given range of addresses from the specified map.
 *
 * The range is processed one twig-table span at a time, and each chunk may
 * additionally be cut short by pending preemption (the internal call returns
 * the VA where it stopped, which becomes the next chunk's start).
 *
 * @param pmap The pmap to remove mappings from; PMAP_NULL is a no-op.
 * @param start Start of the VA range; must be rounded to the hardware page size.
 * @param end Non-inclusive end of the VA range; must be rounded likewise.
 * @param options Removal options forwarded to pmap_remove_options_internal().
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

#if MACH_ASSERT
	if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
	if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
		panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
		    pmap, (uint64_t)start, (uint64_t)end);
	}
#endif

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the end of the current twig-table span. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		va = pmap_remove_options_ppl(pmap, va, l, options);

		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4475
4476
4477 /*
4478 * Remove phys addr if mapped in specified map
4479 */
4480 void
4481 pmap_remove_some_phys(
4482 __unused pmap_t map,
4483 __unused ppnum_t pn)
4484 {
4485 /* Implement to support working set code */
4486 }
4487
4488 /*
4489 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4490 * switch a thread onto a new vm_map.
4491 */
4492 void
4493 pmap_switch_user(thread_t thread, vm_map_t new_map)
4494 {
4495 pmap_t new_pmap = new_map->pmap;
4496
4497
4498 thread->map = new_map;
4499 pmap_set_pmap(new_pmap, thread);
4500
4501 }
4502
/*
 * Activate `pmap` on the current CPU on behalf of `thread`.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Cache the user TTBR value and ASID in the thread's machine state. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4517
4518 static void
4519 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4520 {
4521 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4522 }
4523
4524 static inline bool
4525 pmap_user_ttb_is_clear(void)
4526 {
4527 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4528 }
4529
4530 MARK_AS_PMAP_TEXT void
4531 pmap_switch_internal(
4532 pmap_t pmap)
4533 {
4534 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4535 #if XNU_MONITOR
4536 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4537 #endif
4538 validate_pmap_mutable(pmap);
4539 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4540 uint16_t asid_index = pmap->hw_asid;
4541 bool do_asid_flush = false;
4542 bool do_commpage_flush = false;
4543
4544 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4545 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4546 }
4547 #if __ARM_KERNEL_PROTECT__
4548 asid_index >>= 1;
4549 #endif
4550
4551 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4552 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4553 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4554 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4555 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4556 bool break_before_make = do_shared_region_flush;
4557
4558 #if !HAS_16BIT_ASID
4559 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4560 asid_index -= 1;
4561 pmap_update_plru(asid_index);
4562
4563 /* Paranoia. */
4564 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4565
4566 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4567 uint8_t new_sw_asid = pmap->sw_asid;
4568 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4569
4570 if (new_sw_asid != last_sw_asid) {
4571 /*
4572 * If the virtual ASID of the new pmap does not match the virtual ASID
4573 * last seen on this CPU for the physical ASID (that was a mouthful),
4574 * then this switch runs the risk of aliasing. We need to flush the
4575 * TLB for this phyiscal ASID in this case.
4576 */
4577 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4578 do_asid_flush = true;
4579 break_before_make = true;
4580 }
4581 }
4582 #endif /* !HAS_16BIT_ASID */
4583
4584 #if __ARM_MIXED_PAGE_SIZE__
4585 if (pt_attr->pta_tcr_value != get_tcr()) {
4586 break_before_make = true;
4587 }
4588 #endif
4589 #if __ARM_MIXED_PAGE_SIZE__
4590 /*
4591 * For mixed page size configurations, we need to flush the global commpage mappings from
4592 * the TLB when transitioning between address spaces with different page sizes. Otherwise
4593 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
4594 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4595 * conflict abort or other unpredictable behavior.
4596 */
4597 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4598 do_commpage_flush = true;
4599 }
4600 if (do_commpage_flush) {
4601 break_before_make = true;
4602 }
4603 #endif
4604 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4605 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4606 pmap_clear_user_ttb_internal();
4607 }
4608
4609 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4610 * to flush the userspace mappings for that region. Those mappings are global
4611 * and will not be protected by the ASID. It should also be cheaper to flush the
4612 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4613 if (__improbable(do_shared_region_flush)) {
4614 #if __ARM_RANGE_TLBI__
4615 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4616 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4617
4618 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4619 * There may still be non-global entries that overlap with the incoming pmap's
4620 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4621 * must necessarily belong to a different ASID than the incoming pmap, or they would
4622 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4623 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4624 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4625 * to consider additional invalidation here in the future. */
4626 if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
4627 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4628 } else {
4629 /*
4630 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
4631 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
4632 * have a single-page shared region anyway, not least because pmap_nest()
4633 * requires L2 block alignment of the address and size.
4634 */
4635 do_asid_flush = false;
4636 flush_core_tlb_async();
4637 }
4638 #else
4639 do_asid_flush = false;
4640 flush_core_tlb_async();
4641 #endif // __ARM_RANGE_TLBI__
4642 }
4643
4644 #if __ARM_MIXED_PAGE_SIZE__
4645 if (__improbable(do_commpage_flush)) {
4646 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4647 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4648 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4649 flush_core_tlb_allrange_async(rtlbi_param);
4650 }
4651 #endif
4652 if (__improbable(do_asid_flush)) {
4653 pmap_flush_core_tlb_asid_async(pmap);
4654 #if DEVELOPMENT || DEBUG
4655 os_atomic_inc(&pmap_asid_flushes, relaxed);
4656 #endif
4657 }
4658 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4659 sync_tlb_flush_local();
4660 }
4661
4662 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4663 }
4664
/*
 * Switch the current CPU to the address space described by the given pmap.
 * Thin dispatch wrapper: on XNU_MONITOR (PPL) configurations the switch must
 * be performed inside the PPL; otherwise the internal routine is called
 * directly.  Emits begin/end tracepoints around the switch.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	/* PPL configuration: trap into the PPL to perform the switch. */
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4677
/*
 * Lower the permissions of every mapping of physical page 'ppnum' to at
 * most 'prot'.  Convenience wrapper around pmap_page_protect_options()
 * with no options and no VM-supplied flush argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4685
4686 /*
4687 * Routine: pmap_page_protect_options
4688 *
4689 * Function:
4690 * Lower the permission for all mappings to a given
4691 * page.
4692 */
4693 MARK_AS_PMAP_TEXT static void
4694 pmap_page_protect_options_with_flush_range(
4695 ppnum_t ppnum,
4696 vm_prot_t prot,
4697 unsigned int options,
4698 pmap_tlb_flush_range_t *flush_range)
4699 {
4700 pmap_paddr_t phys = ptoa(ppnum);
4701 pv_entry_t **pv_h;
4702 pv_entry_t *pve_p, *orig_pve_p;
4703 pv_entry_t *pveh_p;
4704 pv_entry_t *pvet_p;
4705 pt_entry_t *pte_p, *orig_pte_p;
4706 pv_entry_t *new_pve_p;
4707 pt_entry_t *new_pte_p;
4708 vm_offset_t pvh_flags;
4709 unsigned int pai;
4710 bool remove;
4711 bool set_NX;
4712 unsigned int pvh_cnt = 0;
4713 unsigned int pass1_updated = 0;
4714 unsigned int pass2_updated = 0;
4715
4716 assert(ppnum != vm_page_fictitious_addr);
4717
4718 /* Only work with managed pages. */
4719 if (!pa_valid(phys)) {
4720 return;
4721 }
4722
4723 /*
4724 * Determine the new protection.
4725 */
4726 switch (prot) {
4727 case VM_PROT_ALL:
4728 return; /* nothing to do */
4729 case VM_PROT_READ:
4730 case VM_PROT_READ | VM_PROT_EXECUTE:
4731 remove = false;
4732 break;
4733 default:
4734 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4735 options = options & ~PMAP_OPTIONS_NOFLUSH;
4736 remove = true;
4737 break;
4738 }
4739
4740 pmap_cpu_data_t *pmap_cpu_data = NULL;
4741 if (remove) {
4742 #if !XNU_MONITOR
4743 mp_disable_preemption();
4744 #endif
4745 pmap_cpu_data = pmap_get_cpu_data();
4746 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4747 /*
4748 * Ensure the store to inflight_disconnect will be observed before any of the
4749 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4750 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4751 * another CPU, in between this function's clearing a PTE and dropping the
4752 * corresponding pagetable refcount. That can lead to a panic if the
4753 * destroying thread observes a non-zero refcount. For this we need a store-
4754 * store barrier; a store-release operation would not be sufficient.
4755 */
4756 os_atomic_thread_fence(release);
4757 }
4758
4759 pai = pa_index(phys);
4760 pvh_lock(pai);
4761 pv_h = pai_to_pvh(pai);
4762 pvh_flags = pvh_get_flags(pv_h);
4763
4764 #if XNU_MONITOR
4765 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4766 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4767 }
4768 if (__improbable(ppattr_pa_test_monitor(phys))) {
4769 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4770 }
4771 #endif
4772
4773
4774 orig_pte_p = pte_p = PT_ENTRY_NULL;
4775 orig_pve_p = pve_p = PV_ENTRY_NULL;
4776 pveh_p = PV_ENTRY_NULL;
4777 pvet_p = PV_ENTRY_NULL;
4778 new_pve_p = PV_ENTRY_NULL;
4779 new_pte_p = PT_ENTRY_NULL;
4780
4781
4782 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4783 orig_pte_p = pte_p = pvh_ptep(pv_h);
4784 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4785 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4786 pveh_p = pve_p;
4787 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4788 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4789 }
4790
4791 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4792 int pve_ptep_idx = 0;
4793
4794 /*
4795 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4796 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4797 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4798 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4799 * operation, TLB invalidation may be handled by the caller so it's possible for
4800 * tlb_flush_needed to be true while issue_tlbi is false.
4801 */
4802 bool issue_tlbi = false;
4803 bool tlb_flush_needed = false;
4804 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4805 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4806 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4807 bool update = false;
4808
4809 if (pve_p != PV_ENTRY_NULL) {
4810 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4811 if (pte_p == PT_ENTRY_NULL) {
4812 goto protect_skip_pve_pass1;
4813 }
4814 }
4815
4816 #ifdef PVH_FLAG_IOMMU
4817 if (pvh_ptep_is_iommu(pte_p)) {
4818 #if XNU_MONITOR
4819 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4820 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4821 ppnum, ptep_get_iommu(pte_p), pve_p);
4822 }
4823 #endif
4824 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4825 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4826 ppnum, ptep_get_iommu(pte_p), pve_p);
4827 }
4828 goto protect_skip_pve_pass1;
4829 }
4830 #endif
4831 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4832 const pmap_t pmap = ptdp->pmap;
4833 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4834
4835 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4836 #if MACH_ASSERT
4837 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4838 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4839 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4840 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4841
4842 pv_entry_t *check_pvep = pve_p;
4843
4844 do {
4845 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4846 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4847 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4848 }
4849 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4850
4851 /* Restore previous PTEP value. */
4852 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4853 }
4854 #endif
4855 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4856 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4857 }
4858
4859 #if DEVELOPMENT || DEBUG
4860 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4861 #else
4862 if ((prot & VM_PROT_EXECUTE))
4863 #endif
4864 {
4865 set_NX = false;
4866 } else {
4867 set_NX = true;
4868 }
4869
4870 #if HAS_FEAT_XS
4871 /**
4872 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4873 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4874 */
4875 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4876 #endif /* HAS_FEAT_XS */
4877
4878 /* Remove the mapping if new protection is NONE */
4879 if (remove) {
4880 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4881 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4882 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4883 pt_entry_t spte = *pte_p;
4884
4885 if (pte_is_wired(spte)) {
4886 pte_set_wired(pmap, pte_p, 0);
4887 spte = *pte_p;
4888 if (pmap != kernel_pmap) {
4889 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4890 }
4891 }
4892
4893 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4894 (uint64_t)spte, pte_p, ppnum);
4895
4896 if (compress && is_internal && (pmap != kernel_pmap)) {
4897 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4898 /* mark this PTE as having been "compressed" */
4899 tmplate = ARM_PTE_COMPRESSED;
4900 if (is_altacct) {
4901 tmplate |= ARM_PTE_COMPRESSED_ALT;
4902 }
4903 } else {
4904 tmplate = ARM_PTE_TYPE_FAULT;
4905 }
4906
4907 assert(spte != tmplate);
4908 write_pte_fast(pte_p, tmplate);
4909 update = true;
4910 ++pass1_updated;
4911
4912 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4913
4914 if (pmap != kernel_pmap) {
4915 if (ppattr_test_reusable(pai) &&
4916 is_internal &&
4917 !is_altacct) {
4918 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4919 } else if (!is_internal) {
4920 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4921 }
4922
4923 if (is_altacct) {
4924 assert(is_internal);
4925 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4926 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4927 if (options & PMAP_OPTIONS_COMPRESSOR) {
4928 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4929 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4930 }
4931 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4932 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4933 } else if (ppattr_test_reusable(pai)) {
4934 assert(is_internal);
4935 if (options & PMAP_OPTIONS_COMPRESSOR) {
4936 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4937 /* was not in footprint, but is now */
4938 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4939 }
4940 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4941 } else if (is_internal) {
4942 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4943
4944 /*
4945 * Update all stats related to physical footprint, which only
4946 * deals with internal pages.
4947 */
4948 if (options & PMAP_OPTIONS_COMPRESSOR) {
4949 /*
4950 * This removal is only being done so we can send this page to
4951 * the compressor; therefore it mustn't affect total task footprint.
4952 */
4953 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4954 } else {
4955 /*
4956 * This internal page isn't going to the compressor, so adjust stats to keep
4957 * phys_footprint up to date.
4958 */
4959 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4960 }
4961 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4962 } else {
4963 /* external page: no impact on ledgers */
4964 }
4965 }
4966 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4967 } else {
4968 pt_entry_t spte = *pte_p;
4969 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4970
4971 if (pmap == kernel_pmap) {
4972 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4973 } else {
4974 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4975 }
4976
4977 /*
4978 * While the naive implementation of this would serve to add execute
4979 * permission, this is not how the VM uses this interface, or how
4980 * x86_64 implements it. So ignore requests to add execute permissions.
4981 */
4982 if (set_NX) {
4983 tmplate |= pt_attr_leaf_xn(pt_attr);
4984 }
4985
4986
4987 assert(spte != ARM_PTE_TYPE_FAULT);
4988 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4989
4990 if (spte != tmplate) {
4991 /*
4992 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4993 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4994 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4995 * should always be cleared by this function.
4996 */
4997 pte_set_was_writeable(tmplate, true);
4998 write_pte_fast(pte_p, tmplate);
4999 update = true;
5000 ++pass1_updated;
5001 } else if (pte_was_writeable(tmplate)) {
5002 /*
5003 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5004 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
5005 * write access to a page, this function should always at least clear that flag for
5006 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5007 * these mappings go through vm_fault(). We therefore don't want those accesses to
5008 * be handled through arm_fast_fault().
5009 */
5010 pte_set_was_writeable(tmplate, false);
5011 write_pte_fast(pte_p, tmplate);
5012 }
5013 }
5014
5015 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5016 tlb_flush_needed = true;
5017 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5018 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5019 issue_tlbi = true;
5020 }
5021 }
5022 protect_skip_pve_pass1:
5023 pte_p = PT_ENTRY_NULL;
5024 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5025 pve_ptep_idx = 0;
5026 pve_p = pve_next(pve_p);
5027 }
5028 }
5029
5030 if (tlb_flush_needed) {
5031 FLUSH_PTE_STRONG();
5032 }
5033
5034 if (!remove && !issue_tlbi) {
5035 goto protect_finish;
5036 }
5037
5038 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5039 pv_entry_t **pve_pp = pv_h;
5040 pve_p = orig_pve_p;
5041 pte_p = orig_pte_p;
5042 pve_ptep_idx = 0;
5043
5044 /*
5045 * We need to keep track of whether a particular PVE list contains IOMMU
5046 * mappings when removing entries, because we should only remove CPU
5047 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5048 * it around.
5049 */
5050 bool iommu_mapping_in_pve = false;
5051 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5052 if (pve_p != PV_ENTRY_NULL) {
5053 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5054 if (pte_p == PT_ENTRY_NULL) {
5055 goto protect_skip_pve_pass2;
5056 }
5057 }
5058
5059 #ifdef PVH_FLAG_IOMMU
5060 if (pvh_ptep_is_iommu(pte_p)) {
5061 iommu_mapping_in_pve = true;
5062 if (remove && (pve_p == PV_ENTRY_NULL)) {
5063 /*
5064 * We've found an IOMMU entry and it's the only entry in the PV list.
5065 * We don't discard IOMMU entries, so simply set up the new PV list to
5066 * contain the single IOMMU PTE and exit the loop.
5067 */
5068 new_pte_p = pte_p;
5069 break;
5070 }
5071 goto protect_skip_pve_pass2;
5072 }
5073 #endif
5074 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5075 const pmap_t pmap = ptdp->pmap;
5076 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5077
5078 if (remove) {
5079 if (!compress && (pmap != kernel_pmap)) {
5080 /*
5081 * We must wait to decrement the refcount until we're completely finished using the PTE
5082 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5083 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5084 * under us.
5085 */
5086 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5087 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5088 }
5089 }
5090 /* Remove this CPU mapping from PVE list. */
5091 if (pve_p != PV_ENTRY_NULL) {
5092 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5093 }
5094 } else {
5095 pt_entry_t spte = *pte_p;
5096 if (pte_was_writeable(spte)) {
5097 pte_set_was_writeable(spte, false);
5098 write_pte_fast(pte_p, spte);
5099 } else {
5100 goto protect_skip_pve_pass2;
5101 }
5102 }
5103 ++pass2_updated;
5104 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5105 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5106 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5107 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5108 }
5109
5110 protect_skip_pve_pass2:
5111 pte_p = PT_ENTRY_NULL;
5112 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5113 pve_ptep_idx = 0;
5114
5115 if (remove) {
5116 /**
5117 * If there are any IOMMU mappings in the PVE list, preserve
5118 * those mappings in a new PVE list (new_pve_p) which will later
5119 * become the new PVH entry. Keep track of the CPU mappings in
5120 * pveh_p/pvet_p so they can be deallocated later.
5121 */
5122 if (iommu_mapping_in_pve) {
5123 iommu_mapping_in_pve = false;
5124 pv_entry_t *temp_pve_p = pve_next(pve_p);
5125 pve_remove(pv_h, pve_pp, pve_p);
5126 pveh_p = pvh_pve_list(pv_h);
5127 pve_p->pve_next = new_pve_p;
5128 new_pve_p = pve_p;
5129 pve_p = temp_pve_p;
5130 continue;
5131 } else {
5132 pvet_p = pve_p;
5133 pvh_cnt++;
5134 }
5135 }
5136
5137 pve_pp = pve_next_ptr(pve_p);
5138 pve_p = pve_next(pve_p);
5139 iommu_mapping_in_pve = false;
5140 }
5141 }
5142
5143 protect_finish:
5144
5145 #ifdef PVH_FLAG_EXEC
5146 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5147 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5148 }
5149 #endif
5150 if (__improbable(pass1_updated != pass2_updated)) {
5151 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5152 __func__, pass1_updated, pass2_updated);
5153 }
5154 /* if we removed a bunch of entries, take care of them now */
5155 if (remove) {
5156 if (new_pve_p != PV_ENTRY_NULL) {
5157 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5158 pvh_set_flags(pv_h, pvh_flags);
5159 } else if (new_pte_p != PT_ENTRY_NULL) {
5160 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5161 pvh_set_flags(pv_h, pvh_flags);
5162 } else {
5163 if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5164 pmap_flush_noncoherent_page(phys);
5165 }
5166 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5167 }
5168 }
5169
5170 if (flush_range && tlb_flush_needed) {
5171 if (!remove) {
5172 flush_range->ptfr_flush_needed = true;
5173 tlb_flush_needed = false;
5174 }
5175 }
5176
5177 /*
5178 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5179 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5180 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5181 * a page to be repurposed while it is still live in the TLBs.
5182 */
5183 if (remove && tlb_flush_needed) {
5184 sync_tlb_flush();
5185 }
5186
5187
5188 pvh_unlock(pai);
5189
5190 if (remove) {
5191 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5192 #if !XNU_MONITOR
5193 mp_enable_preemption();
5194 #endif
5195 }
5196
5197 if (!remove && tlb_flush_needed) {
5198 sync_tlb_flush();
5199 }
5200
5201 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5202 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5203 }
5204 }
5205
5206 MARK_AS_PMAP_TEXT void
5207 pmap_page_protect_options_internal(
5208 ppnum_t ppnum,
5209 vm_prot_t prot,
5210 unsigned int options,
5211 void *arg)
5212 {
5213 if (arg != NULL) {
5214 /*
5215 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5216 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5217 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5218 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5219 * In that case, force the flush to take place.
5220 */
5221 options &= ~PMAP_OPTIONS_NOFLUSH;
5222 }
5223 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5224 }
5225
5226 void
5227 pmap_page_protect_options(
5228 ppnum_t ppnum,
5229 vm_prot_t prot,
5230 unsigned int options,
5231 void *arg)
5232 {
5233 pmap_paddr_t phys = ptoa(ppnum);
5234
5235 assert(ppnum != vm_page_fictitious_addr);
5236
5237 /* Only work with managed pages. */
5238 if (!pa_valid(phys)) {
5239 return;
5240 }
5241
5242 /*
5243 * Determine the new protection.
5244 */
5245 if (prot == VM_PROT_ALL) {
5246 return; /* nothing to do */
5247 }
5248
5249 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5250
5251 #if XNU_MONITOR
5252 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5253 #else
5254 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5255 #endif
5256
5257 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5258 }
5259
5260
5261 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * Permanently disable user-space JOP (pointer authentication of user code)
 * for the given pmap by setting its disable_jop flag.  The kernel pmap is
 * rejected outright; the pmap is validated before mutation.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5271
/*
 * Public wrapper for pmap_disable_user_jop_internal(); routes through the
 * PPL on XNU_MONITOR configurations.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5281 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5282
5283 /*
5284 * Indicates if the pmap layer enforces some additional restrictions on the
5285 * given set of protections.
5286 */
5287 bool
5288 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5289 {
5290 return false;
5291 }
5292
5293 /*
5294 * Set the physical protection on the
5295 * specified range of this map as requested.
5296 * VERY IMPORTANT: Will not increase permissions.
5297 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5298 */
5299 void
5300 pmap_protect(
5301 pmap_t pmap,
5302 vm_map_address_t b,
5303 vm_map_address_t e,
5304 vm_prot_t prot)
5305 {
5306 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5307 }
5308
/*
 * Apply a protection downgrade to the PTEs covering [start, end) in the
 * given pmap.  The range must lie within a single "twig" (L2 table) region;
 * callers chunk larger requests (see pmap_protect_options()).
 *
 * Returns the address at which processing stopped: normally 'end', but may
 * be earlier if pending preemption was detected mid-walk, in which case the
 * caller is expected to resume from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject ranges that are inverted or cross a twig boundary. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Removing all access is pmap_remove()'s job, not ours. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Bail out early (returning the partial 'va') if preemption is pending. */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/*
			 * Lock the PVH for the page this PTE maps, re-reading the PTE after
			 * taking the lock and retrying if it changed underneath us.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Synchronize the PTE stores, then invalidate TLBs for the range we covered. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table for this range: nothing mapped, report the whole range done. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5556
/*
 * Public entry point for protection downgrades over [b, e) in 'pmap'.
 * Validates alignment, handles no-op and remove-equivalent protections,
 * then chunks the range at twig (L2 table) boundaries and dispatches each
 * chunk to the PPL or in-kernel implementation.  The per-chunk call may
 * return early on pending preemption; the loop resumes from its return
 * value.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be page-aligned for this pmap's page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return;         /* nothing to do */
		default:
			/* Revoking all access is a removal, not a protection change. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	while (beg < e) {
		/* Clamp each chunk to the next twig (L2 table) boundary. */
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5631
5632 /**
5633 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5634 *
5635 * @param pmap pmap to insert the pages into.
5636 * @param va virtual address to map the pages into.
5637 * @param pa page number of the first physical page to map.
5638 * @param size block size, in number of pages.
5639 * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags additional flags; currently only included in the failure panic message.
5641 *
5642 * @return KERN_SUCCESS.
5643 */
5644 kern_return_t
5645 pmap_map_block(
5646 pmap_t pmap,
5647 addr64_t va,
5648 ppnum_t pa,
5649 uint32_t size,
5650 vm_prot_t prot,
5651 int attr,
5652 unsigned int flags)
5653 {
5654 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5655 }
5656
5657 /**
5658 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5659 * As opposed to pmap_map_block(), this function takes
5660 * a physical address as an input and operates using the
5661 * page size associated with the input pmap.
5662 *
5663 * @param pmap pmap to insert the pages into.
5664 * @param va virtual address to map the pages into.
5665 * @param pa physical address of the first physical page to map.
5666 * @param size block size, in number of pages.
5667 * @param prot mapping protection attributes.
 * @param attr flags to pass to pmap_enter().
 * @param flags additional flags; currently only included in the failure panic message.
5669 *
5670 * @return KERN_SUCCESS.
5671 */
5672 kern_return_t
5673 pmap_map_block_addr(
5674 pmap_t pmap,
5675 addr64_t va,
5676 pmap_paddr_t pa,
5677 uint32_t size,
5678 vm_prot_t prot,
5679 int attr,
5680 unsigned int flags)
5681 {
5682 #if __ARM_MIXED_PAGE_SIZE__
5683 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5684 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5685 #else
5686 const uint64_t pmap_page_size = PAGE_SIZE;
5687 #endif
5688
5689 for (ppnum_t page = 0; page < size; page++) {
5690 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5691 panic("%s: failed pmap_enter_addr, "
5692 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5693 __FUNCTION__,
5694 pmap, va, (uint64_t)pa, size, prot, flags);
5695 }
5696
5697 va += pmap_page_size;
5698 pa += pmap_page_size;
5699 }
5700
5701 return KERN_SUCCESS;
5702 }
5703
5704 kern_return_t
5705 pmap_enter_addr(
5706 pmap_t pmap,
5707 vm_map_address_t v,
5708 pmap_paddr_t pa,
5709 vm_prot_t prot,
5710 vm_prot_t fault_type,
5711 unsigned int flags,
5712 boolean_t wired)
5713 {
5714 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5715 }
5716
5717 /*
5718 * Insert the given physical page (p) at
5719 * the specified virtual address (v) in the
5720 * target physical map with the protection requested.
5721 *
5722 * If specified, the page will be wired down, meaning
5723 * that the related pte can not be reclaimed.
5724 *
5725 * NB: This is the only routine which MAY NOT lazy-evaluate
5726 * or lose information. That is, this routine must actually
 * insert this page into the given map eventually (must make
 * forward progress eventually).
5729 */
5730 kern_return_t
5731 pmap_enter(
5732 pmap_t pmap,
5733 vm_map_address_t v,
5734 ppnum_t pn,
5735 vm_prot_t prot,
5736 vm_prot_t fault_type,
5737 unsigned int flags,
5738 boolean_t wired)
5739 {
5740 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5741 }
5742
5743 /*
5744 * Attempt to commit the pte.
5745 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5746 * Performs no page table or accounting writes on failures.
5747 */
static inline bool
pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	bool success = false, changed_wiring = false;

	__unreachable_ok_push
	if (TEST_PAGE_RATIO_4) {
		/*
		 * 16K virtual pages w/ 4K hw pages.
		 * We actually need to update 4 ptes here which can't easily be done atomically.
		 * As a result we require the exclusive pmap lock.
		 */
		pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
		*old_pte = *pte_p;
		if (*old_pte == new_pte) {
			/* Another thread completed this operation. Nothing to do here. */
			success = true;
		} else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
		    (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
			success = false;
		} else {
			write_pte_fast(pte_p, new_pte);
			success = true;
		}
	} else {
		/*
		 * Single-PTE case: commit atomically.  On failure, *old_pte is
		 * refreshed with the value observed, so the caller can retry.
		 */
		success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
	}
	__unreachable_ok_pop

	if (success && *old_pte != new_pte) {
		/* The PTE actually changed: publish it and update TLBs/accounting. */
		if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
			/* We replaced a valid mapping, so the TLB may hold a stale entry. */
			bool need_strong_sync = false;
			FLUSH_PTE_STRONG();
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, *old_pte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
		} else {
			/* Previous entry was not valid: no TLB shootdown required. */
			FLUSH_PTE();
			__builtin_arm_isb(ISB_SY);
		}
		/*
		 * A compressed marker carries no wiring, so wiring "changed" iff the
		 * new PTE is wired; otherwise compare old and new wired bits.
		 */
		changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
		    (new_pte & ARM_PTE_WIRED) != 0 :
		    (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);

		/* Keep the page-table wired count and the wired_mem ledger in sync. */
		if (pmap != kernel_pmap && changed_wiring) {
			SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
			if (new_pte & ARM_PTE_WIRED) {
				OSAddAtomic16(1, ptd_wiredcnt_ptr);
				pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			} else {
				OSAddAtomic16(-1, ptd_wiredcnt_ptr);
				pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			}
		}

		PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
		    VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
	}
	return success;
}
5813
/*
 * Translate VM_WIMG_* cacheability/memory-type flags into PTE attribute bits
 * (memory attribute index, shareability, and XN/PNX for device-type memory).
 *
 * 'pa' is used to distinguish DRAM from non-DRAM addresses so that posted /
 * reordered device attributes can be selected where appropriate.
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		/* Device-type mappings are never executable (EL0 or EL1). */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		/* Non-DRAM gets the XS variant of the attribute index. */
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unrecognized WIMG values fall back to normal cacheable memory. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5885
5886
5887 /*
5888 * Construct a PTE (and the physical page attributes) for the given virtual to
5889 * physical mapping.
5890 *
5891 * This function has no side effects and is safe to call so that it is safe to
5892 * call while attempting a pmap_enter transaction.
5893 */
MARK_AS_PMAP_TEXT static pt_entry_t
pmap_construct_pte(
	const pmap_t pmap,
	vm_map_address_t va,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	boolean_t wired,
	const pt_attr_t* const pt_attr,
	uint16_t *pp_attr_bits /* OUTPUT */
	)
{
	bool set_NX = false, set_XO = false;
	pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
	assert(pp_attr_bits != NULL);
	*pp_attr_bits = 0;

	if (wired) {
		pte |= ARM_PTE_WIRED;
	}

	/*
	 * Decide execute permission.  On DEVELOPMENT/DEBUG kernels the global
	 * and per-pmap nx_enabled switches can force mappings to remain
	 * executable regardless of 'prot'.
	 */
#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = false;
	} else {
		set_NX = true;
	}

	/* Execute-only requests use a dedicated permission index (see below). */
	if (prot == VM_PROT_EXECUTE) {
		set_XO = true;
	}

	if (set_NX) {
		pte |= pt_attr_leaf_xn(pt_attr);
	} else {
		if (pmap == kernel_pmap) {
			/* Kernel-executable, but never executable from EL0. */
			pte |= ARM_PTE_NX;
		} else {
			pte |= pt_attr_leaf_x(pt_attr);
		}
	}

	if (pmap == kernel_pmap) {
#if __ARM_KERNEL_PROTECT__
		pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
		if (prot & VM_PROT_WRITE) {
			pte |= ARM_PTE_AP(AP_RWNA);
			*pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
		} else {
			pte |= ARM_PTE_AP(AP_RONA);
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	} else {
		/*
		 * User mappings are non-global unless they live in a nested
		 * (shared-region) pmap; mappings inside the nested region may
		 * still be marked non-global per the region's ASID bitmap.
		 */
		if (pmap->type != PMAP_TYPE_NESTED) {
			pte |= ARM_PTE_NG;
		} else if ((pmap->nested_region_asid_bitmap)
		    && (va >= pmap->nested_region_addr)
		    && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
			unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));

			/* NOTE(review): bitmap pointer is re-tested here although the outer condition already ensured it is non-NULL. */
			if ((pmap->nested_region_asid_bitmap)
			    && testbit(index, (int *)pmap->nested_region_asid_bitmap)) {
				pte |= ARM_PTE_NG;
			}
		}
		if (prot & VM_PROT_WRITE) {
			assert(pmap->type != PMAP_TYPE_NESTED);
			if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
				if (fault_type & VM_PROT_WRITE) {
					/* Entering due to a write fault: map writable and record modified now. */
					pte |= pt_attr_leaf_rw(pt_attr);
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
				} else {
					pte |= pt_attr_leaf_ro(pt_attr);
					/*
					 * Mark the page as MODFAULT so that a subsequent write
					 * may be handled through arm_fast_fault().
					 */
					*pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
					pte_set_was_writeable(pte, true);
				}
			} else {
				pte |= pt_attr_leaf_rw(pt_attr);
				*pp_attr_bits |= PP_ATTR_REFERENCED;
			}
		} else {
			if (set_XO) {
				pte |= pt_attr_leaf_rona(pt_attr);
			} else {
				pte |= pt_attr_leaf_ro(pt_attr);
			}
			*pp_attr_bits |= PP_ATTR_REFERENCED;
		}
	}

	/* Access flag: the mapping is created already "referenced". */
	pte |= ARM_PTE_AF;
	return pte;
}
5996
/*
 * Internal implementation of pmap_enter_options(): establish (or replace) the
 * mapping of virtual address 'v' to physical address 'pa' in 'pmap'.
 *
 * The update is structured as a retryable transaction: the new PTE is
 * committed via pmap_enter_pte(), and any step that must drop locks (PV
 * allocation, shared->exclusive lock upgrade) restarts the loop.
 *
 * Returns KERN_SUCCESS on success; KERN_ABORTED if a preemptible lock
 * acquisition was interrupted; KERN_RESOURCE_SHORTAGE if PV-entry allocation
 * failed; KERN_FAILURE for a disallowed request (executable non-managed
 * mapping); or the pmap_expand() error if table expansion failed.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t pte;
	pt_entry_t spte;
	pt_entry_t *pte_p;
	bool refcnt_updated;
	bool wiredcnt_updated;
	bool ro_va = false;
	unsigned int wimg_bits;
	bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("pmap_enter_options() called without PMAP_OPTIONS_NOWAIT set");
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both VA and PA must be aligned to this pmap's page size. */
	if ((v) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p v 0x%llx",
		    pmap, (uint64_t)v);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	/* RO-zone VAs may only be mapped read-only; remember to write-protect at the end. */
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 * Expand pmap to include this pte. Assume that
	 * pmap is always expanded to include enough hardware
	 * pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	/* Caller only wanted the page tables to exist; nothing to enter. */
	if (options & PMAP_OPTIONS_NOENTER) {
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before commiting
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_TYPE_FAULT;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			if (__improbable(pmap == kernel_pmap)) {
				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
				    __func__);
			}
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* Managed (DRAM) page: PV list and accounting must be maintained. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			/* No allocation is needed if the PVH is empty or already points at this PTE. */
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Return the pre-allocated nodes to the free list; enter_pv will draw from it. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			/* Ledger updates for a newly created mapping (done after dropping the PVH lock). */
			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Non-managed (I/O) physical address: no PV tracking, and never executable. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed. We must therefore prevent an attacker from downgrading a
			 * a writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			/* If we replaced a compressed marker, unwind its compressed-page accounting. */
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	/* Drop the wired-count hold taken at the top of the transaction. */
	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	/* RO zone VA: physically write-protect the page now that it is mapped. */
	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		pmap_phys_write_disable(v);
	}

	return kr;
}
6490
/*
 * Public entry point for creating a mapping with options, taking a physical
 * address.  Wraps the internal (or PPL) implementation in a retry loop that
 * handles transient failures.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	/*
	 * Retry while the inner call reports a transient condition:
	 * KERN_RESOURCE_SHORTAGE (replenish pages and retry, unless the caller
	 * asked for NOWAIT) or KERN_ABORTED (lock acquisition was interrupted).
	 */
	do {
#if XNU_MONITOR
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Give the PPL more pages before retrying (or before returning, if NOWAIT). */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6536
6537 kern_return_t
6538 pmap_enter_options(
6539 pmap_t pmap,
6540 vm_map_address_t v,
6541 ppnum_t pn,
6542 vm_prot_t prot,
6543 vm_prot_t fault_type,
6544 unsigned int flags,
6545 boolean_t wired,
6546 unsigned int options,
6547 __unused void *arg)
6548 {
6549 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6550 }
6551
6552 /*
6553 * Routine: pmap_change_wiring
6554 * Function: Change the wiring attribute for a map/virtual-address
6555 * pair.
6556 * In/out conditions:
6557 * The mapping must already exist in the pmap.
6558 */
6559 MARK_AS_PMAP_TEXT kern_return_t
6560 pmap_change_wiring_internal(
6561 pmap_t pmap,
6562 vm_map_address_t v,
6563 boolean_t wired)
6564 {
6565 pt_entry_t *pte_p;
6566 pmap_paddr_t pa;
6567
6568 validate_pmap_mutable(pmap);
6569
6570 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6571 return KERN_ABORTED;
6572 }
6573
6574 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6575
6576 pte_p = pmap_pte(pmap, v);
6577 if (pte_p == PT_ENTRY_NULL) {
6578 if (!wired) {
6579 /*
6580 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6581 * may have been freed by a remove operation.
6582 */
6583 goto pmap_change_wiring_return;
6584 } else {
6585 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6586 }
6587 }
6588 /*
6589 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6590 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6591 */
6592 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6593
6594 while (pa_valid(pa)) {
6595 pmap_paddr_t new_pa;
6596
6597 pvh_lock(pa_index(pa));
6598 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6599
6600 if (pa == new_pa) {
6601 break;
6602 }
6603
6604 pvh_unlock(pa_index(pa));
6605 pa = new_pa;
6606 }
6607
6608 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6609 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6610 if (!wired) {
6611 /* PTE cleared by prior remove/disconnect operation */
6612 goto pmap_change_wiring_cleanup;
6613 } else {
6614 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6615 __func__, pte_p, (uint64_t)*pte_p, pmap);
6616 }
6617 }
6618
6619 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6620 if (wired != pte_is_wired(*pte_p)) {
6621 pte_set_wired(pmap, pte_p, wired);
6622 if (pmap != kernel_pmap) {
6623 if (wired) {
6624 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6625 } else if (!wired) {
6626 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6627 }
6628 }
6629 }
6630
6631 pmap_change_wiring_cleanup:
6632 if (pa_valid(pa)) {
6633 pvh_unlock(pa_index(pa));
6634 }
6635
6636 pmap_change_wiring_return:
6637 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6638
6639 return KERN_SUCCESS;
6640 }
6641
/*
 * Routine:	pmap_change_wiring
 * Function:	External entry point for changing a mapping's wired state.
 *		Retries on KERN_ABORTED (PPL builds) and panics on any
 *		other failure, since callers have no recovery path.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6669
6670 MARK_AS_PMAP_TEXT pmap_paddr_t
6671 pmap_find_pa_internal(
6672 pmap_t pmap,
6673 addr64_t va)
6674 {
6675 pmap_paddr_t pa = 0;
6676
6677 validate_pmap(pmap);
6678
6679 if (pmap != kernel_pmap) {
6680 pmap_lock(pmap, PMAP_LOCK_SHARED);
6681 }
6682
6683 pa = pmap_vtophys(pmap, va);
6684
6685 if (pmap != kernel_pmap) {
6686 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6687 }
6688
6689 return pa;
6690 }
6691
6692 pmap_paddr_t
6693 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6694 {
6695 pmap_paddr_t pa = 0;
6696
6697 if (pmap == kernel_pmap) {
6698 pa = mmu_kvtop(va);
6699 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6700 /*
6701 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6702 * translation even if PAN would prevent kernel access through the translation.
6703 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6704 */
6705 pa = mmu_uvtop(va);
6706 }
6707 return pa;
6708 }
6709
6710 pmap_paddr_t
6711 pmap_find_pa(
6712 pmap_t pmap,
6713 addr64_t va)
6714 {
6715 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6716
6717 if (pa != 0) {
6718 return pa;
6719 }
6720
6721 if (not_in_kdp) {
6722 #if XNU_MONITOR
6723 return pmap_find_pa_ppl(pmap, va);
6724 #else
6725 return pmap_find_pa_internal(pmap, va);
6726 #endif
6727 } else {
6728 return pmap_vtophys(pmap, va);
6729 }
6730 }
6731
6732 ppnum_t
6733 pmap_find_phys_nofault(
6734 pmap_t pmap,
6735 addr64_t va)
6736 {
6737 ppnum_t ppn;
6738 ppn = atop(pmap_find_pa_nofault(pmap, va));
6739 return ppn;
6740 }
6741
6742 ppnum_t
6743 pmap_find_phys(
6744 pmap_t pmap,
6745 addr64_t va)
6746 {
6747 ppnum_t ppn;
6748 ppn = atop(pmap_find_pa(pmap, va));
6749 return ppn;
6750 }
6751
6752 /**
6753 * Translate a kernel virtual address into a physical address.
6754 *
6755 * @param va The kernel virtual address to translate. Does not work on user
6756 * virtual addresses.
6757 *
6758 * @return The physical address if the translation was successful, or zero if
6759 * no valid mappings were found for the given virtual address.
6760 */
6761 pmap_paddr_t
6762 kvtophys(vm_offset_t va)
6763 {
6764 /**
6765 * Attempt to do the translation first in hardware using the AT (address
6766 * translation) instruction. This will attempt to use the MMU to do the
6767 * translation for us.
6768 */
6769 pmap_paddr_t pa = mmu_kvtop(va);
6770
6771 if (pa) {
6772 return pa;
6773 }
6774
6775 /* If the MMU can't find the mapping, then manually walk the page tables. */
6776 return pmap_vtophys(kernel_pmap, va);
6777 }
6778
6779 /**
6780 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6781 * points to a non-kernel-managed physical page, then this call will panic().
6782 *
6783 * @note The output of this function is guaranteed to be a kernel-managed
6784 * physical page, which means it's safe to pass the output directly to
6785 * pa_index() to create a physical address index for various pmap data
6786 * structures.
6787 *
6788 * @param va The kernel virtual address to translate. Does not work on user
6789 * virtual addresses.
6790 *
6791 * @return The translated physical address for the given virtual address.
6792 */
6793 pmap_paddr_t
6794 kvtophys_nofail(vm_offset_t va)
6795 {
6796 pmap_paddr_t pa = kvtophys(va);
6797
6798 if (!pa_valid(pa)) {
6799 panic("%s: Invalid or non-kernel-managed physical page returned, "
6800 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6801 }
6802
6803 return pa;
6804 }
6805
/*
 * pmap_vtophys - translate a virtual address by manually walking the
 * given pmap's translation tables from the root level to the leaf.
 *
 * Returns the physical address (including the offset within the leaf
 * page or block), or 0 when 'va' is out of range or no valid entry
 * covers it.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	/* Addresses outside the pmap's VA range cannot have a translation. */
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend one table level per iteration. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		/* Invalid entry at this level: translation fails. */
		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the output address with the offset within the block/page. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table-type entry: follow it to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6850
6851 /*
6852 * pmap_init_pte_page - Initialize a page table page.
6853 */
6854 MARK_AS_PMAP_TEXT void
6855 pmap_init_pte_page(
6856 pmap_t pmap,
6857 pt_entry_t *pte_p,
6858 vm_offset_t va,
6859 unsigned int ttlevel,
6860 boolean_t alloc_ptd)
6861 {
6862 pt_desc_t *ptdp = NULL;
6863 pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6864
6865 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6866 if (alloc_ptd) {
6867 /*
6868 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6869 * on 4KB hardware, we may already have allocated a page table descriptor for a
6870 * bootstrap request, so we check for an existing PTD here.
6871 */
6872 ptdp = ptd_alloc(pmap);
6873 if (ptdp == NULL) {
6874 panic("%s: unable to allocate PTD", __func__);
6875 }
6876 pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6877 /* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6878 pvh_set_flags(pvh, 0);
6879 } else {
6880 panic("pmap_init_pte_page(): pte_p %p", pte_p);
6881 }
6882 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6883 ptdp = pvh_ptd(pvh);
6884 } else {
6885 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6886 }
6887
6888 // below barrier ensures previous updates to the page are visible to PTW before
6889 // it is linked to the PTE of previous level
6890 __builtin_arm_dmb(DMB_ISHST);
6891 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6892 }
6893
6894 /*
6895 * Routine: pmap_expand
6896 *
6897 * Expands a pmap to be able to map the specified virtual address.
6898 *
6899 * Allocates new memory for the default (COARSE) translation table
6900 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6901 * also allocates space for the corresponding pv entries.
6902 *
6903 * Nothing should be locked.
6904 */
6905 MARK_AS_PMAP_TEXT static kern_return_t
6906 pmap_expand(
6907 pmap_t pmap,
6908 vm_map_address_t v,
6909 unsigned int options,
6910 unsigned int level)
6911 {
6912 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6913
6914 if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6915 return KERN_INVALID_ADDRESS;
6916 }
6917 pmap_paddr_t pa;
6918 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6919 tt_entry_t *tte_p;
6920 tt_entry_t *tt_p;
6921
6922 pa = 0x0ULL;
6923 tt_p = (tt_entry_t *)NULL;
6924
6925 for (; ttlevel < level; ttlevel++) {
6926 if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
6927 return KERN_ABORTED;
6928 }
6929
6930 if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
6931 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6932 kern_return_t ret;
6933 while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
6934 if (options & PMAP_OPTIONS_NOWAIT) {
6935 /* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
6936 return ret;
6937 }
6938 #if XNU_MONITOR
6939 panic("%s: failed to allocate tt, "
6940 "pmap=%p, v=%p, options=0x%x, level=%u",
6941 __FUNCTION__,
6942 pmap, (void *)v, options, level);
6943 #else
6944 VM_PAGE_WAIT();
6945 #endif
6946 }
6947
6948 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6949 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6950 return KERN_ABORTED;
6951 }
6952
6953 if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
6954 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
6955 pa = kvtophys_nofail((vm_offset_t)tt_p);
6956 tte_p = pmap_ttne(pmap, ttlevel, v);
6957 *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6958 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6959 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
6960 pa = 0x0ULL;
6961 tt_p = (tt_entry_t *)NULL;
6962 }
6963 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6964 } else {
6965 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6966 }
6967
6968 if (tt_p != (tt_entry_t *)NULL) {
6969 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6970 tt_p = (tt_entry_t *)NULL;
6971 }
6972 }
6973
6974 return KERN_SUCCESS;
6975 }
6976
6977 /*
6978 * Routine: pmap_gc
6979 * Function:
6980 * Pmap garbage collection
6981 * Called by the pageout daemon when pages are scarce.
6982 *
6983 */
6984 void
6985 pmap_gc(void)
6986 {
6987 /*
6988 * TODO: as far as I can tell this has never been implemented to do anything meaninful.
6989 * We can't just destroy any old pmap on the chance that it may be active on a CPU
6990 * or may contain wired mappings. However, with the relatively recent change to
6991 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
6992 * page, it may make sense to call that function here.
6993 */
6994 }
6995
6996 /*
6997 * By default, don't attempt pmap GC more frequently
6998 * than once / 1 minutes.
6999 */
7000
/* Intentional no-op: pmap GC throttling is not implemented on this platform. */
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7006
7007 /*
7008 * pmap_attribute_cache_sync(vm_offset_t pa)
7009 *
7010 * Invalidates all of the instruction cache on a physical page and
7011 * pushes any dirty data from the data cache for the same physical page
7012 */
7013
7014 kern_return_t
7015 pmap_attribute_cache_sync(
7016 ppnum_t pp,
7017 vm_size_t size,
7018 __unused vm_machine_attribute_t attribute,
7019 __unused vm_machine_attribute_val_t * value)
7020 {
7021 if (size > PAGE_SIZE) {
7022 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7023 } else {
7024 cache_sync_page(pp);
7025 }
7026
7027 return KERN_SUCCESS;
7028 }
7029
7030 /*
7031 * pmap_sync_page_data_phys(ppnum_t pp)
7032 *
7033 * Invalidates all of the instruction cache on a physical page and
7034 * pushes any dirty data from the data cache for the same physical page
7035 */
7036 void
7037 pmap_sync_page_data_phys(
7038 ppnum_t pp)
7039 {
7040 cache_sync_page(pp);
7041 }
7042
7043 /*
7044 * pmap_sync_page_attributes_phys(ppnum_t pp)
7045 *
7046 * Write back and invalidate all cachelines on a physical page.
7047 */
7048 void
7049 pmap_sync_page_attributes_phys(
7050 ppnum_t pp)
7051 {
7052 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7053 }
7054
7055 #if CONFIG_COREDUMP
7056 /* temporary workaround */
7057 boolean_t
7058 coredumpok(
7059 vm_map_t map,
7060 mach_vm_offset_t va)
7061 {
7062 pt_entry_t *pte_p;
7063 pt_entry_t spte;
7064
7065 pte_p = pmap_pte(map->pmap, va);
7066 if (0 == pte_p) {
7067 return FALSE;
7068 }
7069 if (vm_map_entry_has_device_pager(map, va)) {
7070 return FALSE;
7071 }
7072 spte = *pte_p;
7073 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7074 }
7075 #endif
7076
7077 void
7078 fillPage(
7079 ppnum_t pn,
7080 unsigned int fill)
7081 {
7082 unsigned int *addr;
7083 int count;
7084
7085 addr = (unsigned int *) phystokv(ptoa(pn));
7086 count = PAGE_SIZE / sizeof(unsigned int);
7087 while (count--) {
7088 *addr++ = fill;
7089 }
7090 }
7091
7092 extern void mapping_set_mod(ppnum_t pn);
7093
/* Exported mapping-layer hook: mark the page as modified (dirty). */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7100
7101 extern void mapping_set_ref(ppnum_t pn);
7102
/* Exported mapping-layer hook: mark the page as referenced. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7109
7110 /*
7111 * Clear specified attribute bits.
7112 *
7113 * Try to force an arm_fast_fault() for all mappings of
7114 * the page - to force attributes to be set again at fault time.
7115 * If the forcing succeeds, clear the cached bits at the head.
7116 * Otherwise, something must have been wired, so leave the cached
7117 * attributes alone.
7118 */
7119 MARK_AS_PMAP_TEXT static void
7120 phys_attribute_clear_with_flush_range(
7121 ppnum_t pn,
7122 unsigned int bits,
7123 int options,
7124 void *arg,
7125 pmap_tlb_flush_range_t *flush_range)
7126 {
7127 pmap_paddr_t pa = ptoa(pn);
7128 vm_prot_t allow_mode = VM_PROT_ALL;
7129
7130 #if XNU_MONITOR
7131 if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7132 panic("%s: illegal request, "
7133 "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7134 __FUNCTION__,
7135 pn, bits, options, arg, flush_range);
7136 }
7137 #endif
7138 if ((arg != NULL) || (flush_range != NULL)) {
7139 options = options & ~PMAP_OPTIONS_NOFLUSH;
7140 }
7141
7142 if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
7143 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7144 "invalid options",
7145 pn, bits, options, arg, flush_range);
7146 }
7147
7148 if (__improbable((bits & PP_ATTR_MODIFIED) &&
7149 (options & PMAP_OPTIONS_NOFLUSH))) {
7150 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7151 "should not clear 'modified' without flushing TLBs",
7152 pn, bits, options, arg, flush_range);
7153 }
7154
7155 assert(pn != vm_page_fictitious_addr);
7156
7157 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7158 assert(bits == PP_ATTR_MODIFIED);
7159
7160 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7161 /*
7162 * We short circuit this case; it should not need to
7163 * invoke arm_force_fast_fault, so just clear the modified bit.
7164 * pmap_page_protect has taken care of resetting
7165 * the state so that we'll see the next write as a fault to
7166 * the VM (i.e. we don't want a fast fault).
7167 */
7168 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7169 return;
7170 }
7171 if (bits & PP_ATTR_REFERENCED) {
7172 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7173 }
7174 if (bits & PP_ATTR_MODIFIED) {
7175 allow_mode &= ~VM_PROT_WRITE;
7176 }
7177
7178 if (bits == PP_ATTR_NOENCRYPT) {
7179 /*
7180 * We short circuit this case; it should not need to
7181 * invoke arm_force_fast_fault, so just clear and
7182 * return. On ARM, this bit is just a debugging aid.
7183 */
7184 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7185 return;
7186 }
7187
7188 if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7189 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7190 }
7191 }
7192
/*
 * Single-page entry point for clearing physical page attributes; forwards
 * to phys_attribute_clear_with_flush_range() with no coalesced-flush state.
 */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7202
7203 #if __ARM_RANGE_TLBI__
/*
 * Clear attribute bits for every managed page mapped in [start, end) of
 * 'pmap', where the range must lie within a single "twig" (one leaf
 * table's coverage).
 *
 * Returns the VA at which processing stopped: 'end' on completion, or an
 * earlier VA when preemption became pending (the caller resumes from the
 * returned address).
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	/* No twig table: nothing is mapped here, so the whole range is done. */
	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* After at least one page, yield when preemption is pending. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				/* Only kernel-managed pages carry software attribute bits. */
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7247
/*
 * Clear attribute bits across the VA range [start, end) of 'pmap', one
 * twig at a time, coalescing any required TLB invalidations into a single
 * ranged flush at the end.
 *
 * Returns the VA at which processing stopped; a value below 'end' means
 * the operation was interrupted (lock abort or pending preemption) and
 * the caller should retry from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	while (va < end) {
		vm_map_address_t curr_end;

		/* Clamp this chunk to the end of the current twig (or to 'end'). */
		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		/* Stop early if the twig was cut short or preemption is pending. */
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	/* Issue the single coalesced TLB flush for everything we touched. */
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7300
/*
 * Top-level driver for ranged attribute clearing: repeatedly invokes the
 * internal/PPL helper (which may stop early on preemption) until the
 * whole [start, end) range has been processed.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7331 #endif /* __ARM_RANGE_TLBI__ */
7332
/*
 * Dispatch a single-page attribute clear to the PPL (XNU_MONITOR builds)
 * or directly to the internal helper, wrapped in trace points.
 */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint? It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7354
7355 /*
7356 * Set specified attribute bits.
7357 *
7358 * Set cached value in the pv head because we have
7359 * no per-mapping hardware support for referenced and
7360 * modify bits.
7361 */
7362 MARK_AS_PMAP_TEXT void
7363 phys_attribute_set_internal(
7364 ppnum_t pn,
7365 unsigned int bits)
7366 {
7367 pmap_paddr_t pa = ptoa(pn);
7368 assert(pn != vm_page_fictitious_addr);
7369
7370 #if XNU_MONITOR
7371 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7372 panic("%s: illegal request, "
7373 "pn=%u, bits=%#x",
7374 __FUNCTION__,
7375 pn, bits);
7376 }
7377 #endif
7378
7379 ppattr_pa_set_bits(pa, (uint16_t)bits);
7380
7381 return;
7382 }
7383
/* Dispatch attribute setting to the PPL or directly to the internal helper. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7395
7396
7397 /*
7398 * Check specified attribute bits.
7399 *
7400 * use the software cached bits (since no hw support).
7401 */
7402 static boolean_t
7403 phys_attribute_test(
7404 ppnum_t pn,
7405 unsigned int bits)
7406 {
7407 pmap_paddr_t pa = ptoa(pn);
7408 assert(pn != vm_page_fictitious_addr);
7409 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7410 }
7411
7412
7413 /*
7414 * Set the modify/reference bits on the specified physical page.
7415 */
7416 void
7417 pmap_set_modify(ppnum_t pn)
7418 {
7419 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7420 }
7421
7422
7423 /*
7424 * Clear the modify bits on the specified physical page.
7425 */
7426 void
7427 pmap_clear_modify(
7428 ppnum_t pn)
7429 {
7430 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7431 }
7432
7433
7434 /*
7435 * pmap_is_modified:
7436 *
7437 * Return whether or not the specified physical page is modified
7438 * by any physical maps.
7439 */
7440 boolean_t
7441 pmap_is_modified(
7442 ppnum_t pn)
7443 {
7444 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7445 }
7446
7447
7448 /*
7449 * Set the reference bit on the specified physical page.
7450 */
7451 static void
7452 pmap_set_reference(
7453 ppnum_t pn)
7454 {
7455 phys_attribute_set(pn, PP_ATTR_REFERENCED);
7456 }
7457
7458 /*
7459 * Clear the reference bits on the specified physical page.
7460 */
7461 void
7462 pmap_clear_reference(
7463 ppnum_t pn)
7464 {
7465 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7466 }
7467
7468
7469 /*
7470 * pmap_is_referenced:
7471 *
7472 * Return whether or not the specified physical page is referenced
7473 * by any physical maps.
7474 */
7475 boolean_t
7476 pmap_is_referenced(
7477 ppnum_t pn)
7478 {
7479 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7480 }
7481
7482 /*
7483 * pmap_get_refmod(phys)
7484 * returns the referenced and modified bits of the specified
7485 * physical page.
7486 */
7487 unsigned int
7488 pmap_get_refmod(
7489 ppnum_t pn)
7490 {
7491 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7492 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7493 }
7494
7495 static inline unsigned int
7496 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7497 {
7498 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7499 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7500 }
7501
7502 /*
7503 * pmap_clear_refmod(phys, mask)
7504 * clears the referenced and modified bits as specified by the mask
7505 * of the specified physical page.
7506 */
7507 void
7508 pmap_clear_refmod_options(
7509 ppnum_t pn,
7510 unsigned int mask,
7511 unsigned int options,
7512 void *arg)
7513 {
7514 unsigned int bits;
7515
7516 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7517 phys_attribute_clear(pn, bits, options, arg);
7518 }
7519
7520 /*
7521 * Perform pmap_clear_refmod_options on a virtual address range.
7522 * The operation will be performed in bulk & tlb flushes will be coalesced
7523 * if possible.
7524 *
7525 * Returns true if the operation is supported on this platform.
7526 * If this function returns false, the operation is not supported and
7527 * nothing has been modified in the pmap.
7528 */
7529 bool
7530 pmap_clear_refmod_range_options(
7531 pmap_t pmap __unused,
7532 vm_map_address_t start __unused,
7533 vm_map_address_t end __unused,
7534 unsigned int mask __unused,
7535 unsigned int options __unused)
7536 {
7537 #if __ARM_RANGE_TLBI__
7538 unsigned int bits;
7539 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7540 phys_attribute_clear_range(pmap, start, end, bits, options);
7541 return true;
7542 #else /* __ARM_RANGE_TLBI__ */
7543 #pragma unused(pmap, start, end, mask, options)
7544 /*
7545 * This operation allows the VM to bulk modify refmod bits on a virtually
7546 * contiguous range of addresses. This is large performance improvement on
7547 * platforms that support ranged tlbi instructions. But on older platforms,
7548 * we can only flush per-page or the entire asid. So we currently
7549 * only support this operation on platforms that support ranged tlbi.
7550 * instructions. On other platforms, we require that
7551 * the VM modify the bits on a per-page basis.
7552 */
7553 return false;
7554 #endif /* __ARM_RANGE_TLBI__ */
7555 }
7556
/* Single-page wrapper for pmap_clear_refmod_options() with no options. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7564
7565 unsigned int
7566 pmap_disconnect_options(
7567 ppnum_t pn,
7568 unsigned int options,
7569 void *arg)
7570 {
7571 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7572 /*
7573 * On ARM, the "modified" bit is managed by software, so
7574 * we know up-front if the physical page is "modified",
7575 * without having to scan all the PTEs pointing to it.
7576 * The caller should have made the VM page "busy" so noone
7577 * should be able to establish any new mapping and "modify"
7578 * the page behind us.
7579 */
7580 if (pmap_is_modified(pn)) {
7581 /*
7582 * The page has been modified and will be sent to
7583 * the VM compressor.
7584 */
7585 options |= PMAP_OPTIONS_COMPRESSOR;
7586 } else {
7587 /*
7588 * The page hasn't been modified and will be freed
7589 * instead of compressed.
7590 */
7591 }
7592 }
7593
7594 /* disconnect the page */
7595 pmap_page_protect_options(pn, 0, options, arg);
7596
7597 /* return ref/chg status */
7598 return pmap_get_refmod(pn);
7599 }
7600
7601 /*
7602 * Routine:
7603 * pmap_disconnect
7604 *
7605 * Function:
7606 * Disconnect all mappings for this page and return reference and change status
7607 * in generic format.
7608 *
7609 */
7610 unsigned int
7611 pmap_disconnect(
7612 ppnum_t pn)
7613 {
7614 pmap_page_protect(pn, 0); /* disconnect the page */
7615 return pmap_get_refmod(pn); /* return ref/chg status */
7616 }
7617
7618 boolean_t
7619 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7620 {
7621 if (ptoa(first) >= vm_last_phys) {
7622 return FALSE;
7623 }
7624 if (ptoa(last) < vm_first_phys) {
7625 return FALSE;
7626 }
7627
7628 return TRUE;
7629 }
7630
7631 /*
7632 * The state maintained by the noencrypt functions is used as a
7633 * debugging aid on ARM. This incurs some overhead on the part
7634 * of the caller. A special case check in phys_attribute_clear
7635 * (the most expensive path) currently minimizes this overhead,
7636 * but stubbing these functions out on RELEASE kernels yields
7637 * further wins.
7638 */
7639 boolean_t
7640 pmap_is_noencrypt(
7641 ppnum_t pn)
7642 {
7643 #if DEVELOPMENT || DEBUG
7644 boolean_t result = FALSE;
7645
7646 if (!pa_valid(ptoa(pn))) {
7647 return FALSE;
7648 }
7649
7650 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7651
7652 return result;
7653 #else
7654 #pragma unused(pn)
7655 return FALSE;
7656 #endif
7657 }
7658
/*
 * Mark the page as "noencrypt".  The state is only tracked on
 * DEVELOPMENT/DEBUG kernels; on other configurations this is a no-op
 * (see the note above pmap_is_noencrypt).
 */
void
pmap_set_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Ignore pages that aren't kernel-managed. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
#else
#pragma unused(pn)
#endif
}
7673
/*
 * Clear the page's "noencrypt" state.  Only tracked on DEVELOPMENT/DEBUG
 * kernels; otherwise a no-op (see the note above pmap_is_noencrypt).
 */
void
pmap_clear_noencrypt(
	ppnum_t pn)
{
#if DEVELOPMENT || DEBUG
	/* Ignore pages that aren't kernel-managed. */
	if (!pa_valid(ptoa(pn))) {
		return;
	}

	phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
#else
#pragma unused(pn)
#endif
}
7688
7689 #if XNU_MONITOR
/* Return whether the given managed page carries the PP_ATTR_MONITOR bit. */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
7696 #endif
7697
/*
 * Lock the per-page (PVH) lock for a managed page; for unmanaged pages
 * (or always, on XNU_MONITOR configurations) take the global
 * phys_backup_lock instead.  Pairs with pmap_unlock_phys_page().
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		/* Managed page: lock its PV head entry. */
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
	/*
	 * Note: the `else` above deliberately binds to the brace-enclosed
	 * statement after the #endif; do not reformat this construct.
	 */
#else
	(void)pn;
#endif
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7714
7715
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PVH lock
 * for managed pages, otherwise the global phys_backup_lock.
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		/* Managed page: unlock its PV head entry. */
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
	/*
	 * Note: the `else` above deliberately binds to the brace-enclosed
	 * statement after the #endif; do not reformat this construct.
	 */
#else
	(void)pn;
#endif
	{ simple_unlock(&phys_backup_lock);}
}
7732
/*
 * Install `pmap` as the current user address space on this CPU: cache the
 * nested (shared-region) pmap state in the per-CPU data, reprogram TCR if
 * the page-size configuration differs (mixed page-size kernels only), and
 * finally load TTBR0 with the pmap's translation table base and ASID.
 * Switching to kernel_pmap instead clears the user TTB if it isn't
 * already clear.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		/* Publish the nested-pmap info used for shared-region lookups. */
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Reprogram TCR only when the incoming pmap's value differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		/* Load TTBR0 with the table base plus the pmap's hardware ASID. */
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7761
/* Point TTBR0 at the invalid (empty) translation table. */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7767
/*
 * External entry point to clear the user translation table base,
 * dispatching through the PPL on XNU_MONITOR configurations.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7779
7780
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Guard against the marker bit colliding with other SW-managed PTE bits. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7794
7795
/*
 * Routine:	arm_force_fast_fault_with_flush_range
 *
 * Function:
 *	Walk the PV list of the physical page `ppnum` and rewrite each
 *	mapping's PTE so that any access not permitted by `allow_mode`
 *	takes a fault, letting ref/mod state be gathered in software.
 *	Uses a two-pass scheme: pass 1 rewrites PTEs, tagging those that
 *	need TLB invalidation with ARM_PTE_FF_MARKER; pass 2 clears the
 *	markers and issues the invalidations (except for mappings covered
 *	by `flush_range`, whose flush is delegated to the caller).
 *	Also processes PMAP_OPTIONS_SET/CLEAR_REUSABLE ledger updates.
 *
 *	Returns TRUE unless a wired mapping had to be skipped (see
 *	PMAP_OPTIONS_FF_WIRED below).  With PMAP_OPTIONS_FF_LOCKED the
 *	caller is expected to already hold the PVH lock.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to downgrade. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left intact unless the caller opted in via FF_WIRED. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			/*
			 * A flush is only required when hardware-visible bits changed
			 * (ARM_PTE_WRITEABLE is software-only) and the caller did not
			 * request NOFLUSH.
			 */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					/* Tag this PTE so pass 2 knows to invalidate it. */
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		/* Make pass-1 PTE stores visible before any TLB invalidation. */
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		/* Only invalidate mappings not covered by the caller's flush_range. */
		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	/* NOTE(review): ref_aliases_mod is never set true in this function as written. */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record that software fault tracking is now armed for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8109
8110 MARK_AS_PMAP_TEXT boolean_t
8111 arm_force_fast_fault_internal(
8112 ppnum_t ppnum,
8113 vm_prot_t allow_mode,
8114 int options)
8115 {
8116 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8117 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8118 }
8119 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8120 }
8121
8122 /*
8123 * Routine: arm_force_fast_fault
8124 *
8125 * Function:
8126 * Force all mappings for this page to fault according
8127 * to the access modes allowed, so we can gather ref/modify
8128 * bits again.
8129 */
8130
8131 boolean_t
8132 arm_force_fast_fault(
8133 ppnum_t ppnum,
8134 vm_prot_t allow_mode,
8135 int options,
8136 __unused void *arg)
8137 {
8138 pmap_paddr_t phys = ptoa(ppnum);
8139
8140 assert(ppnum != vm_page_fictitious_addr);
8141
8142 if (!pa_valid(phys)) {
8143 return FALSE; /* Not a managed page. */
8144 }
8145
8146 #if XNU_MONITOR
8147 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8148 #else
8149 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8150 #endif
8151 }
8152
8153 /*
8154 * Routine: arm_clear_fast_fault
8155 *
8156 * Function:
8157 * Clear pending force fault for all mappings for this page based on
8158 * the observed fault type, update ref/modify bits.
8159 */
/*
 * Walk the PV list of `ppnum` (or only the single PTE `pte_p`, if given)
 * and restore access/write permission on mappings that were previously
 * downgraded for software ref/mod tracking, updating PP_ATTR_REFERENCED /
 * PP_ATTR_MODIFIED as appropriate for `fault_type`.  Two-pass scheme as
 * in arm_force_fast_fault_with_flush_range().  Caller must hold the PVH
 * lock.  Returns TRUE if at least one PTE was updated.  May stop early
 * after PMAP_MAX_PV_LIST_CHUNK_SIZE entries to bound PPL latency.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If the caller supplied a specific PTE, operate on it alone. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings have no CPU PTE to restore. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore write permission and mark the page referenced+modified. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Re-enable the access flag and mark the page referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* HW-visible change: tag for invalidation in pass 2. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the amount of work done under the PVH lock. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	/* Make pass-1 PTE stores visible before issuing invalidations. */
	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Must terminate at the same point as pass 1. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8361
8362 /*
8363 * Determine if the fault was induced by software tracking of
8364 * modify/reference bits. If so, re-enable the mapping (and set
8365 * the appropriate bits).
8366 *
8367 * Returns KERN_SUCCESS if the fault was induced and was
8368 * successfully handled.
8369 *
8370 * Returns KERN_FAILURE if the fault was not induced and
8371 * the function was unable to deal with it.
8372 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
8374 * disallows this type of access.
8375 *
8376 * Returns KERN_ABORTED if the pmap lock is taken and a
8377 * preemption is pending.
8378 *
8379 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_TYPE_FAULT;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Back off (KERN_ABORTED) rather than block a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Loop until we observe a stable PTE value while holding the
		 * PVH lock for the page it maps.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Accesses to PPL-owned I/O memory are a protection failure. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; retry with the new value. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	/* result is still KERN_FAILURE here; the first clause always holds. */
	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8519
/*
 * External entry point for software fast-fault handling.  Validates that
 * `va` lies within the pmap's range, then dispatches to the internal (or
 * PPL) handler, retrying for as long as it returns KERN_ABORTED (i.e. it
 * dropped out to allow a pending preemption).
 */
kern_return_t
arm_fast_fault(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;

	if (va < pmap->min || va >= pmap->max) {
		return result;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
	    from_user);

	do {
#if XNU_MONITOR
		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
	} while (result == KERN_ABORTED);

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

	return result;
}
8550
8551 void
8552 pmap_copy_page(
8553 ppnum_t psrc,
8554 ppnum_t pdst)
8555 {
8556 bcopy_phys((addr64_t) (ptoa(psrc)),
8557 (addr64_t) (ptoa(pdst)),
8558 PAGE_SIZE);
8559 }
8560
8561
8562 /*
8563 * pmap_copy_page copies the specified (machine independent) pages.
8564 */
8565 void
8566 pmap_copy_part_page(
8567 ppnum_t psrc,
8568 vm_offset_t src_offset,
8569 ppnum_t pdst,
8570 vm_offset_t dst_offset,
8571 vm_size_t len)
8572 {
8573 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8574 (addr64_t) (ptoa(pdst) + dst_offset),
8575 len);
8576 }
8577
8578
8579 /*
8580 * pmap_zero_page zeros the specified (machine independent) page.
8581 */
8582 void
8583 pmap_zero_page(
8584 ppnum_t pn)
8585 {
8586 assert(pn != vm_page_fictitious_addr);
8587 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8588 }
8589
8590 /*
8591 * pmap_zero_part_page
8592 * zeros the specified (machine independent) part of a page.
8593 */
8594 void
8595 pmap_zero_part_page(
8596 ppnum_t pn,
8597 vm_offset_t offset,
8598 vm_size_t len)
8599 {
8600 assert(pn != vm_page_fictitious_addr);
8601 assert(offset + len <= PAGE_SIZE);
8602 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8603 }
8604
/*
 * Establish the LOWGLOBAL_ALIAS mapping: a read-only, non-executable,
 * write-back cached kernel mapping of the lowGlo page.  The target PTE
 * slot must exist and currently be empty.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	/* RO, kernel-only, never executable, access flag pre-set. */
	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	/* Publish the PTE before invalidating any stale translation. */
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8629
8630 vm_offset_t
8631 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8632 {
8633 if (__improbable(index >= CPUWINDOWS_MAX)) {
8634 panic("%s: invalid index %u", __func__, index);
8635 }
8636 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8637 }
8638
/*
 * Map physical page `pn` into a free per-CPU copy window with the given
 * protection and WIMG cache attributes, returning the window index.
 * Panics if all CPUWINDOWS_MAX windows on this CPU are in use.  The
 * returned index is later passed to pmap_unmap_cpu_windows_copy().
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	/* Cache attributes are only looked up for non-managed (I/O) pages. */
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	/* The PPL forbids copy-window mappings of managed pages entirely. */
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Find the first free (faulting) window slot on this CPU. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	/* Build a kernel-only, never-executable PTE for the window. */
	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8713
/*
 * External entry point: map `pn` into a per-CPU copy window, dispatching
 * through the PPL on XNU_MONITOR configurations.  Returns the window index.
 */
unsigned int
pmap_map_cpu_windows_copy(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
#if XNU_MONITOR
	return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
#else
	return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
#endif
}
8726
/*
 * Tear down the per-CPU copy window `index` on the current CPU: fence
 * outstanding accesses, clear the window PTE, and invalidate its TLB
 * entry (strongly, if the prior mapping required it).
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t *ptep;
	unsigned int cpu_num;
	vm_offset_t cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8747
/**
 * Unmap a per-CPU copy window on the current CPU.
 *
 * @param index copy window index returned by pmap_map_cpu_windows_copy().
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	/* On PPL-enabled systems, kernel page tables may only be modified by the PPL. */
	return pmap_unmap_cpu_windows_copy_ppl(index);
#else
	return pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8758
8759 #if XNU_MONITOR
8760
8761 MARK_AS_PMAP_TEXT void
8762 pmap_invoke_with_page(
8763 ppnum_t page_number,
8764 void *ctx,
8765 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8766 {
8767 #pragma unused(page_number, ctx, callback)
8768 }
8769
8770 /*
8771 * Loop over every pmap_io_range (I/O ranges marked as owned by
8772 * the PPL in the device tree) and conditionally call callback() on each range
8773 * that needs to be included in the hibernation image.
8774 *
8775 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8776 * context is needed in the callback.
8777 * @param callback Callback function invoked on each range (gated by flag).
8778 */
8779 MARK_AS_PMAP_TEXT void
8780 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8781 {
8782 extern const pmap_io_range_t* io_attr_table;
8783 extern const unsigned int num_io_rgns;
8784 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8785 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8786 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8787 }
8788 }
8789 }
8790
8791 /**
8792 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8793 * PPL-owned page. Otherwise, do nothing.
8794 *
8795 * @param addr Physical address of the page to set the HASHED flag on.
8796 */
8797 MARK_AS_PMAP_TEXT void
8798 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8799 {
8800 /* Ignore non-managed kernel memory. */
8801 if (!pa_valid(addr)) {
8802 return;
8803 }
8804
8805 const unsigned int pai = pa_index(addr);
8806 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8807 pv_entry_t **pv_h = pai_to_pvh(pai);
8808
8809 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8810 pvh_lock(pai);
8811 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8812 pvh_unlock(pai);
8813 }
8814 }
8815
8816 /**
8817 * Loop through every physical page in the system and clear out the HASHED flag
8818 * on every PPL-owned page. That flag is used to keep track of which pages have
8819 * been hashed into the hibernation image during the hibernation entry process.
8820 *
8821 * The HASHED flag needs to be cleared out between hibernation cycles because the
8822 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8823 * image with the HASHED flag set on certain pages. It's important to clear the
8824 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8825 * into the hibernation image can't be compromised across hibernation cycles.
8826 */
8827 MARK_AS_PMAP_TEXT void
8828 pmap_clear_ppl_hashed_flag_all(void)
8829 {
8830 const unsigned int last_index = pa_index(vm_last_phys);
8831 pv_entry_t **pv_h = NULL;
8832
8833 for (int pai = 0; pai < last_index; ++pai) {
8834 pv_h = pai_to_pvh(pai);
8835
8836 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8837 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8838 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8839 pvh_lock(pai);
8840 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8841 pvh_unlock(pai);
8842 }
8843 }
8844 }
8845
8846 /**
8847 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8848 * ppl_hib driver will call this after all wired pages have been copied into the
8849 * hibernation image.
8850 */
8851 MARK_AS_PMAP_TEXT void
8852 pmap_check_ppl_hashed_flag_all(void)
8853 {
8854 const unsigned int last_index = pa_index(vm_last_phys);
8855 pv_entry_t **pv_h = NULL;
8856
8857 for (int pai = 0; pai < last_index; ++pai) {
8858 pv_h = pai_to_pvh(pai);
8859
8860 /**
8861 * The PMAP stacks are explicitly not saved into the image so skip checking
8862 * the pages that contain the PMAP stacks.
8863 */
8864 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8865 (pai < pa_index(pmap_stacks_end_pa));
8866
8867 if (!is_pmap_stack &&
8868 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8869 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8870 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8871 }
8872 }
8873 }
8874
8875 #endif /* XNU_MONITOR */
8876
8877 /*
8878 * Indicate that a pmap is intended to be used as a nested pmap
8879 * within one or more larger address spaces. This must be set
8880 * before pmap_nest() is called with this pmap as the 'subordinate'.
8881 */
8882 MARK_AS_PMAP_TEXT void
8883 pmap_set_nested_internal(
8884 pmap_t pmap)
8885 {
8886 validate_pmap_mutable(pmap);
8887 if (__improbable(pmap->type != PMAP_TYPE_USER)) {
8888 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8889 __func__, pmap, pmap->type);
8890 }
8891 pmap->type = PMAP_TYPE_NESTED;
8892 pmap_get_pt_ops(pmap)->free_id(pmap);
8893 }
8894
/**
 * Mark a pmap as nestable (see pmap_set_nested_internal() for details).
 *
 * @param pmap the pmap to convert to a nested pmap; must be a user pmap.
 */
void
pmap_set_nested(
	pmap_t pmap)
{
#if XNU_MONITOR
	/* On PPL-enabled systems, pmap state may only be mutated by the PPL. */
	pmap_set_nested_ppl(pmap);
#else
	pmap_set_nested_internal(pmap);
#endif
}
8905
8906 bool
8907 pmap_is_nested(
8908 pmap_t pmap)
8909 {
8910 return pmap->type == PMAP_TYPE_NESTED;
8911 }
8912
8913 /*
8914 * pmap_trim_range(pmap, start, end)
8915 *
8916 * pmap = pmap to operate on
8917 * start = start of the range
8918 * end = end of the range
8919 *
8920 * Attempts to deallocate TTEs for the given range in the nested range.
8921 */
8922 MARK_AS_PMAP_TEXT static void
8923 pmap_trim_range(
8924 pmap_t pmap,
8925 addr64_t start,
8926 addr64_t end)
8927 {
8928 addr64_t cur;
8929 addr64_t nested_region_start;
8930 addr64_t nested_region_end;
8931 addr64_t adjusted_start;
8932 addr64_t adjusted_end;
8933 addr64_t adjust_offmask;
8934 tt_entry_t * tte_p;
8935 pt_entry_t * pte_p;
8936 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8937
8938 if (__improbable(end < start)) {
8939 panic("%s: invalid address range, "
8940 "pmap=%p, start=%p, end=%p",
8941 __func__,
8942 pmap, (void*)start, (void*)end);
8943 }
8944
8945 nested_region_start = pmap->nested_region_addr;
8946 nested_region_end = nested_region_start + pmap->nested_region_size;
8947
8948 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8949 panic("%s: range outside nested region %p-%p, "
8950 "pmap=%p, start=%p, end=%p",
8951 __func__, (void *)nested_region_start, (void *)nested_region_end,
8952 pmap, (void*)start, (void*)end);
8953 }
8954
8955 /* Contract the range to TT page boundaries. */
8956 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8957 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8958 adjusted_end = end & ~adjust_offmask;
8959
8960 /* Iterate over the range, trying to remove TTEs. */
8961 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
8962 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8963
8964 tte_p = pmap_tte(pmap, cur);
8965
8966 if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
8967 pte_p = (pt_entry_t *) ttetokv(*tte_p);
8968
8969 /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
8970 if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
8971 /* Deallocate for the nested map. */
8972 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8973 } else if (pmap->type == PMAP_TYPE_USER) {
8974 /**
8975 * Just remove for the parent map. If the leaf table pointed
8976 * to by the TTE being removed (owned by the nested pmap)
8977 * has any mappings, then this call will panic. This
8978 * enforces the policy that tables being trimmed must be
8979 * empty to prevent possible use-after-free attacks.
8980 */
8981 pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8982 } else {
8983 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8984 }
8985 } else {
8986 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8987 }
8988 }
8989
8990 /* Remove empty L2 TTs. */
8991 adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
8992 adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
8993
8994 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
8995 /* For each L1 entry in our range... */
8996 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8997
8998 bool remove_tt1e = true;
8999 tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
9000 tt_entry_t * tt2e_start;
9001 tt_entry_t * tt2e_end;
9002 tt_entry_t * tt2e_p;
9003 tt_entry_t tt1e;
9004
9005 if (tt1e_p == NULL) {
9006 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9007 continue;
9008 }
9009
9010 tt1e = *tt1e_p;
9011
9012 if (tt1e == ARM_TTE_TYPE_FAULT) {
9013 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9014 continue;
9015 }
9016
9017 tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9018 tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9019
9020 for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9021 if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9022 /*
9023 * If any TTEs are populated, don't remove the
9024 * L1 TT.
9025 */
9026 remove_tt1e = false;
9027 }
9028 }
9029
9030 if (remove_tt1e) {
9031 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9032 } else {
9033 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9034 }
9035 }
9036 }
9037
9038 /**
9039 * State machine for multi-step pmap trimming. Trimming is the action of
9040 * deallocating the TTEs of the shared region of pmaps down to a given range.
9041 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9042 * disabling preemption for too long. These steps include computing the bounds
9043 * of the shared region, trimming the head of the "grand", trimming the tail of
9044 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9045 * different conditions.
9046 *
9047 * @param grand the pmap in which the pages are nested
9048 * @param subord the pmap from which the pages are shared, or nested
9049 * @param vstart start of the used range in "grand"
9050 * @param size size of the used range
9051 * @param state the current state of the state machine
9052 *
9053 * @return the next state of the state machine, to be used in the next call
9054 * into this function.
9055 */
9056 MARK_AS_PMAP_TEXT pmap_trim_state_t
9057 pmap_trim_internal(
9058 pmap_t grand,
9059 pmap_t subord,
9060 addr64_t vstart,
9061 uint64_t size,
9062 pmap_trim_state_t state)
9063 {
9064 /* Validation needs to be done regardless of state. */
9065 addr64_t vend;
9066
9067 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9068 panic("%s: grand addr wraps around, "
9069 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9070 __func__, grand, subord, (void*)vstart, size, state);
9071 }
9072
9073 validate_pmap_mutable(grand);
9074 validate_pmap(subord);
9075
9076 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9077 panic("%s: subord is of non-nestable type 0x%hhx, "
9078 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9079 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9080 }
9081
9082 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9083 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9084 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9085 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9086 }
9087
9088 if (__improbable(grand->nested_pmap != subord)) {
9089 panic("%s: grand->nested != subord, "
9090 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9091 __func__, grand, subord, (void*)vstart, size, state);
9092 }
9093
9094 if (__improbable((size != 0) &&
9095 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9096 panic("%s: grand range not in nested region, "
9097 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9098 __func__, grand, subord, (void*)vstart, size, state);
9099 }
9100
9101 /* Trimming starts with figuring out the bounds for the grand. */
9102 if (state == PMAP_TRIM_STATE_START) {
9103 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9104
9105 /**
9106 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9107 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9108 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9109 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9110 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9111 * PMAP_TRIM_STATE_DONE.
9112 */
9113 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9114 assert(subord->nested_bounds_set);
9115
9116 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9117 if (!grand->nested_bounds_set) {
9118 /* Inherit the bounds from subord. */
9119 grand->nested_region_true_start = subord->nested_region_true_start;
9120 grand->nested_region_true_end = subord->nested_region_true_end;
9121 grand->nested_bounds_set = true;
9122 }
9123
9124 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9125
9126 /* Now that the grand has bounds, we are done. */
9127 return PMAP_TRIM_STATE_DONE;
9128 }
9129
9130 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9131 if ((!subord->nested_bounds_set) && size) {
9132 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9133 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9134
9135 subord->nested_region_true_start = vstart;
9136 subord->nested_region_true_end = vend;
9137 subord->nested_region_true_start &= ~adjust_offmask;
9138
9139 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9140 panic("%s: padded true end wraps around, "
9141 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9142 __func__, grand, subord, (void*)vstart, size, state);
9143 }
9144
9145 subord->nested_region_true_end &= ~adjust_offmask;
9146 subord->nested_bounds_set = true;
9147 }
9148
9149 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9150 if (subord->nested_bounds_set) {
9151 /* Inherit the bounds from subord. */
9152 grand->nested_region_true_start = subord->nested_region_true_start;
9153 grand->nested_region_true_end = subord->nested_region_true_end;
9154 grand->nested_bounds_set = true;
9155
9156 /* If we know the bounds, we can trim the pmap. */
9157 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9158
9159 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9160 } else {
9161 /* Don't trim if we don't know the bounds. */
9162 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9163
9164 return PMAP_TRIM_STATE_DONE;
9165 }
9166 }
9167
9168 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9169 if (!grand->nested_bounds_set) {
9170 panic("%s: !grand->nested_bounds_set, "
9171 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9172 __func__, grand, subord, (void*)vstart, size, state);
9173 }
9174
9175 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9176 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9177 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9178 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9179 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9180 (unsigned int)grand->nested_no_bounds_ref_state);
9181 }
9182
9183 #if XNU_MONITOR
9184 if (pmap_pending_preemption()) {
9185 return PMAP_TRIM_STATE_GRAND_AFTER;
9186 }
9187 #endif
9188
9189 state = PMAP_TRIM_STATE_GRAND_AFTER;
9190 }
9191
9192 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9193 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9194 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9195 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9196 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9197 (unsigned int)grand->nested_no_bounds_ref_state);
9198 }
9199
9200 #if XNU_MONITOR
9201 if (pmap_pending_preemption()) {
9202 return PMAP_TRIM_STATE_SUBORD;
9203 }
9204 #endif
9205
9206 state = PMAP_TRIM_STATE_SUBORD;
9207 }
9208
9209 /* START state is guaranteed to compute the bounds for the subord. */
9210 if (!subord->nested_bounds_set) {
9211 panic("%s: !subord->nested_bounds_set, "
9212 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9213 __func__, grand, subord, (void*)vstart, size, state);
9214 }
9215
9216 if (state == PMAP_TRIM_STATE_SUBORD) {
9217 /**
9218 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9219 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9220 * called once grand's nested tables have been fully trimmed, and can only be called once
9221 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9222 * the state update is visible only once the preceding trim operation is complete. An
9223 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9224 * but locking within pmap_trim_range() should make that harmless (and all but one will
9225 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9226 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9227 * of the state CAS.
9228 */
9229 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9230 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9231 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9232 (unsigned int)grand->nested_no_bounds_ref_state);
9233 }
9234 pmap_trim_subord(subord);
9235 }
9236
9237 return PMAP_TRIM_STATE_DONE;
9238 }
9239
/**
 * Drop this pmap's no-bounds reference on its nested pmap (if it holds one),
 * trimming the unused head and tail of the nested region from this pmap and
 * then attempting to trim the nested pmap itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's true bounds under its shared lock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim everything outside [true_start, true_end) from this pmap. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9263
9264 /*
9265 * pmap_trim_subord(grand, subord)
9266 *
9267 * grand = pmap that we have nested subord in
9268 * subord = nested pmap we are attempting to trim
9269 *
9270 * Trims subord if possible
9271 */
9272 MARK_AS_PMAP_TEXT static void
9273 pmap_trim_subord(pmap_t subord)
9274 {
9275 bool contract_subord = false;
9276
9277 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9278
9279 subord->nested_no_bounds_refcnt--;
9280
9281 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9282 /* If this was the last no bounds reference, trim subord. */
9283 contract_subord = true;
9284 }
9285
9286 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9287
9288 if (contract_subord) {
9289 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9290 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9291 }
9292 }
9293
9294 /**
9295 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9296 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9297 * disabling preemption for too long.
9298 *
9299 * @note When we load the shared region we always create pages tables for the
9300 * entire region. In practice, the shared cache may use just a portion
9301 * of that. Before we know the bounds of the shared region, it can
9302 * already be mapped into processes. Therefore, once the bounds are
9303 * known, "trimming" comes in handy to remove the unnecessary page
9304 * tables in the processes the shared region is mapped in, and eventually
9305 * those in the shared region itself. Note that the shared region must
9306 * be trimmed after the user processes because it has the L3 entries
9307 * everyone else is pointing to.
9308 *
9309 * @param grand the pmap in which the pages are nested
9310 * @param subord the pmap from which the pages are shared, or nested
9311 * @param vstart start of the used range in "grand"
9312 * @param size size of the used range
9313 */
9314 void
9315 pmap_trim(
9316 pmap_t grand,
9317 pmap_t subord,
9318 addr64_t vstart,
9319 uint64_t size)
9320 {
9321 pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9322
9323 #if XNU_MONITOR
9324 /* On PPL systems, drives the state machine until its done. */
9325 while (state != PMAP_TRIM_STATE_DONE) {
9326 __assert_only pmap_trim_state_t old_state = state;
9327 state = pmap_trim_ppl(grand, subord, vstart, size, state);
9328
9329 /* Are we making progress? */
9330 assert(old_state != state);
9331 }
9332
9333 pmap_ledger_check_balance(grand);
9334 pmap_ledger_check_balance(subord);
9335 #else
9336 state = pmap_trim_internal(grand, subord, vstart, size, state);
9337
9338 /* On non-PPL systems, we expect the implementation to finish in one call. */
9339 assert(state == PMAP_TRIM_STATE_DONE);
9340 #endif
9341 }
9342
9343 #if HAS_APPLE_PAC
9344 void *
9345 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9346 {
9347 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9348 panic("attempt to sign user pointer without process independent key");
9349 }
9350
9351 void *res = NULL;
9352 uint64_t current_intr_state = pmap_interrupts_disable();
9353
9354 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9355
9356 __compiler_materialize_and_prevent_reordering_on(value);
9357 switch (key) {
9358 case ptrauth_key_asia:
9359 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9360 break;
9361 case ptrauth_key_asda:
9362 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9363 break;
9364 default:
9365 __builtin_unreachable();
9366 }
9367 __compiler_materialize_and_prevent_reordering_on(res);
9368
9369 ml_disable_user_jop_key(jop_key, saved_jop_state);
9370
9371 pmap_interrupts_restore(current_intr_state);
9372
9373 return res;
9374 }
9375
/**
 * Public entry point for signing a user pointer; see
 * pmap_sign_user_ptr_internal() for parameter semantics.
 */
void *
pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
}
9381
/**
 * Authenticate a user-space pointer signed with a process-independent
 * pointer-auth key, under the user process's JOP key.
 *
 * @param value the signed pointer to authenticate
 * @param key must be ptrauth_key_asia or ptrauth_key_asda; any other key panics
 * @param discriminator pointer-auth discriminator the pointer was signed with
 * @param jop_key the user process's JOP key, installed for the duration of
 *        the auth operation
 *
 * @return the authenticated pointer.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	/* Disable interrupts so nothing can observe or disturb the temporary user JOP key. */
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Compiler barriers keep the auth op inside the user-JOP-key window. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9402
/**
 * Public entry point for authenticating a user pointer; see
 * pmap_auth_user_ptr_internal() for parameter semantics.
 */
void *
pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
}
9408 #endif /* HAS_APPLE_PAC */
9409
9410 /*
9411 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9412 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9413 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9414 * return value, to indicate where a preempted [un]nest operation should resume.
9415 * When the return value contains the ending address of the nested region with
9416 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9417 */
9418 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9419
9420 /*
9421 * kern_return_t pmap_nest(grand, subord, vstart, size)
9422 *
9423 * grand = the pmap that we will nest subord into
9424 * subord = the pmap that goes into the grand
9425 * vstart = start of range in pmap to be inserted
9426 * size = Size of nest area (up to 16TB)
9427 *
9428 * Inserts a pmap into another. This is used to implement shared segments.
9429 *
9430 */
9431
9432 /**
9433 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9434 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9435 * This function operates in 3 main phases:
9436 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9437 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9438 * the mapping range are present in subord.
9439 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9440 * contains pointers to subord's leaf-level pagetable pages for the specified
9441 * VA range.
9442 *
9443 * This function may return early due to pending AST_URGENT preemption; if so
9444 * it will indicate the need to be re-entered.
9445 *
9446 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9447 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9448 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9449 * @param size twig-aligned size of the nesting range
9450 * @param vrestart the twig-aligned starting address of the current call. May contain
9451 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9452 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9453 * KERN_RESOURCE_SHORTAGE on allocation failure.
9454 *
9455 * @return the virtual address at which to restart the operation, possibly including
9456 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9457 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9458 */
9459 MARK_AS_PMAP_TEXT vm_map_offset_t
9460 pmap_nest_internal(
9461 pmap_t grand,
9462 pmap_t subord,
9463 addr64_t vstart,
9464 uint64_t size,
9465 vm_map_offset_t vrestart,
9466 kern_return_t *krp)
9467 {
9468 kern_return_t kr = KERN_FAILURE;
9469 vm_map_offset_t vaddr;
9470 tt_entry_t *stte_p;
9471 tt_entry_t *gtte_p;
9472 unsigned int nested_region_asid_bitmap_size;
9473 unsigned int* nested_region_asid_bitmap = NULL;
9474 unsigned int new_nested_region_asid_bitmap_size;
9475 unsigned int* new_nested_region_asid_bitmap = NULL;
9476 int expand_options = 0;
9477 bool deref_subord = true;
9478 bool grand_locked = false;
9479
9480 addr64_t vend;
9481 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9482 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9483 }
9484 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9485 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9486 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9487 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9488 }
9489
9490 assert(krp != NULL);
9491 validate_pmap_mutable(grand);
9492 validate_pmap(subord);
9493 #if XNU_MONITOR
9494 /*
9495 * Ordering is important here. validate_pmap() has already ensured subord is a
9496 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9497 * be in the process of being destroyed. If destruction is already committed,
9498 * then the check of ref_count below will cover us. If destruction is initiated
9499 * during or after this call, then pmap_destroy() will catch the non-zero
9500 * nested_count.
9501 */
9502 os_atomic_inc(&subord->nested_count, relaxed);
9503 os_atomic_thread_fence(seq_cst);
9504 #endif
9505 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9506 panic("%s: invalid subordinate pmap %p", __func__, subord);
9507 }
9508
9509 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9510 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9511 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9512 }
9513
9514 #if XNU_MONITOR
9515 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9516 #endif
9517
9518 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9519 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9520 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9521 grand, vstart, size, (unsigned long long)vrestart);
9522 }
9523
9524 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9525 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9526 }
9527
9528 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9529 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9530 }
9531
9532 if (subord->nested_region_asid_bitmap == NULL) {
9533 nested_region_asid_bitmap_size = (unsigned int)(size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY);
9534
9535 #if XNU_MONITOR
9536 pmap_paddr_t pa = 0;
9537
9538 if (__improbable((nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9539 panic("%s: nested_region_asid_bitmap_size=%u will not fit in a page, "
9540 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9541 __FUNCTION__, nested_region_asid_bitmap_size,
9542 grand, subord, vstart, size);
9543 }
9544
9545 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9546
9547 if (kr != KERN_SUCCESS) {
9548 goto nest_cleanup;
9549 }
9550
9551 assert(pa);
9552
9553 nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9554 #else
9555 nested_region_asid_bitmap = kalloc_data(
9556 nested_region_asid_bitmap_size * sizeof(unsigned int),
9557 Z_WAITOK | Z_ZERO);
9558 #endif
9559
9560 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9561 kr = KERN_ABORTED;
9562 goto nest_cleanup;
9563 }
9564
9565 if (subord->nested_region_asid_bitmap == NULL) {
9566 subord->nested_region_asid_bitmap_size = nested_region_asid_bitmap_size;
9567 subord->nested_region_addr = vstart;
9568 subord->nested_region_size = (mach_vm_offset_t) size;
9569
9570 /**
9571 * Ensure that the rest of the subord->nested_region_* fields are
9572 * initialized and visible before setting the nested_region_asid_bitmap
9573 * field (which is used as the flag to say that the rest are initialized).
9574 */
9575 __builtin_arm_dmb(DMB_ISHST);
9576 subord->nested_region_asid_bitmap = nested_region_asid_bitmap;
9577 nested_region_asid_bitmap = NULL;
9578 }
9579 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9580 if (nested_region_asid_bitmap != NULL) {
9581 #if XNU_MONITOR
9582 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9583 #else
9584 kfree_data(nested_region_asid_bitmap,
9585 nested_region_asid_bitmap_size * sizeof(unsigned int));
9586 #endif
9587 nested_region_asid_bitmap = NULL;
9588 }
9589 }
9590
9591 /**
9592 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9593 * speculated before their initialization.
9594 */
9595 __builtin_arm_dmb(DMB_ISHLD);
9596
9597 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9598 uint64_t new_size;
9599
9600 nested_region_asid_bitmap = NULL;
9601 nested_region_asid_bitmap_size = 0;
9602 new_size = vend - subord->nested_region_addr;
9603
9604 /* We explicitly add 1 to the bitmap allocation size in order to avoid issues with truncation. */
9605 new_nested_region_asid_bitmap_size = (unsigned int)((new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY)) + 1;
9606
9607 #if XNU_MONITOR
9608 pmap_paddr_t pa = 0;
9609
9610 if (__improbable((new_nested_region_asid_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9611 panic("%s: new_nested_region_asid_bitmap_size=%u will not fit in a page, "
9612 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9613 __FUNCTION__, new_nested_region_asid_bitmap_size,
9614 grand, subord, vstart, new_size);
9615 }
9616
9617 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9618
9619 if (kr != KERN_SUCCESS) {
9620 goto nest_cleanup;
9621 }
9622
9623 assert(pa);
9624
9625 new_nested_region_asid_bitmap = (unsigned int *)phystokv(pa);
9626 #else
9627 new_nested_region_asid_bitmap = kalloc_data(
9628 new_nested_region_asid_bitmap_size * sizeof(unsigned int),
9629 Z_WAITOK | Z_ZERO);
9630 #endif
9631 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9632 kr = KERN_ABORTED;
9633 goto nest_cleanup;
9634 }
9635
9636 if (subord->nested_region_size < new_size) {
9637 bcopy(subord->nested_region_asid_bitmap,
9638 new_nested_region_asid_bitmap, subord->nested_region_asid_bitmap_size);
9639 nested_region_asid_bitmap_size = subord->nested_region_asid_bitmap_size;
9640 nested_region_asid_bitmap = subord->nested_region_asid_bitmap;
9641 subord->nested_region_asid_bitmap = new_nested_region_asid_bitmap;
9642 subord->nested_region_asid_bitmap_size = new_nested_region_asid_bitmap_size;
9643 subord->nested_region_size = new_size;
9644 new_nested_region_asid_bitmap = NULL;
9645 }
9646 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9647 if (nested_region_asid_bitmap != NULL) {
9648 #if XNU_MONITOR
9649 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9650 #else
9651 kfree_data(nested_region_asid_bitmap,
9652 nested_region_asid_bitmap_size * sizeof(unsigned int));
9653 #endif
9654 nested_region_asid_bitmap = NULL;
9655 }
9656 if (new_nested_region_asid_bitmap != NULL) {
9657 #if XNU_MONITOR
9658 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9659 #else
9660 kfree_data(new_nested_region_asid_bitmap,
9661 new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9662 #endif
9663 new_nested_region_asid_bitmap = NULL;
9664 }
9665 }
9666
9667 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9668 kr = KERN_ABORTED;
9669 goto nest_cleanup;
9670 }
9671
9672 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, relaxed)) {
9673 /*
9674 * If this is grand's first nesting operation, keep the reference on subord.
9675 * It will be released by pmap_destroy_internal() when grand is destroyed.
9676 */
9677 deref_subord = false;
9678
9679 if (!subord->nested_bounds_set) {
9680 /*
9681 * We are nesting without the shared regions bounds
9682 * being known. We'll have to trim the pmap later.
9683 */
9684 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9685 NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9686 panic("%s: grand %p already nested", __func__, grand);
9687 }
9688 subord->nested_no_bounds_refcnt++;
9689 }
9690
9691 grand->nested_region_addr = vstart;
9692 grand->nested_region_size = (mach_vm_offset_t) size;
9693 } else {
9694 if (__improbable(grand->nested_pmap != subord)) {
9695 panic("pmap_nest() pmap %p has a nested pmap", grand);
9696 } else if (__improbable(grand->nested_region_addr > vstart)) {
9697 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9698 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9699 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9700 }
9701 }
9702
9703 vaddr = vrestart & ~PMAP_NEST_GRAND;
9704 if (vaddr < subord->nested_region_true_start) {
9705 vaddr = subord->nested_region_true_start;
9706 }
9707
9708 addr64_t true_end = vend;
9709 if (true_end > subord->nested_region_true_end) {
9710 true_end = subord->nested_region_true_end;
9711 }
9712 __unused unsigned int ttecount = 0;
9713
9714 if (vrestart & PMAP_NEST_GRAND) {
9715 goto nest_grand;
9716 }
9717
9718 while (vaddr < true_end) {
9719 stte_p = pmap_tte(subord, vaddr);
9720 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9721 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9722 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9723
9724 if (kr != KERN_SUCCESS) {
9725 goto done;
9726 }
9727
9728 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9729 }
9730 vaddr += pt_attr_twig_size(pt_attr);
9731 vrestart = vaddr;
9732 ++ttecount;
9733 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9734 pmap_pending_preemption())) {
9735 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9736 kr = KERN_SUCCESS;
9737 goto done;
9738 }
9739 }
9740 /*
9741 * copy TTEs from subord pmap into grand pmap
9742 */
9743
9744 vaddr = (vm_map_offset_t) vstart;
9745 if (vaddr < subord->nested_region_true_start) {
9746 vaddr = subord->nested_region_true_start;
9747 }
9748 vrestart = vaddr | PMAP_NEST_GRAND;
9749
9750 nest_grand:
9751 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9752
9753 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9754 kr = KERN_ABORTED;
9755 goto done;
9756 }
9757 while (vaddr < true_end) {
9758 stte_p = pmap_tte(subord, vaddr);
9759 gtte_p = pmap_tte(grand, vaddr);
9760 if (gtte_p == PT_ENTRY_NULL) {
9761 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9762 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9763 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9764 if (kr == KERN_SUCCESS) {
9765 kr = KERN_ABORTED;
9766 }
9767 }
9768
9769 if (kr != KERN_SUCCESS) {
9770 goto done;
9771 }
9772
9773 gtte_p = pmap_tt2e(grand, vaddr);
9774 }
9775 /* Don't leak a page table page. Don't violate break-before-make. */
9776 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9777 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9778 __func__, gtte_p, grand);
9779 }
9780 *gtte_p = *stte_p;
9781
9782 vaddr += pt_attr_twig_size(pt_attr);
9783 vrestart = vaddr | PMAP_NEST_GRAND;
9784 ++ttecount;
9785 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9786 pmap_pending_preemption())) {
9787 break;
9788 }
9789 }
9790 if (vaddr >= true_end) {
9791 vrestart = vend | PMAP_NEST_GRAND;
9792 }
9793
9794 kr = KERN_SUCCESS;
9795 done:
9796
9797 FLUSH_PTE();
9798 __builtin_arm_isb(ISB_SY);
9799
9800 if (grand_locked) {
9801 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9802 }
9803
9804 nest_cleanup:
9805 #if XNU_MONITOR
9806 if (kr != KERN_SUCCESS) {
9807 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9808 *krp = kr;
9809 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9810 }
9811 #else
9812 if (kr != KERN_SUCCESS) {
9813 *krp = kr;
9814 }
9815 #endif
9816 if (nested_region_asid_bitmap != NULL) {
9817 #if XNU_MONITOR
9818 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_asid_bitmap), PAGE_SIZE);
9819 #else
9820 kfree_data(nested_region_asid_bitmap,
9821 nested_region_asid_bitmap_size * sizeof(unsigned int));
9822 #endif
9823 }
9824 if (new_nested_region_asid_bitmap != NULL) {
9825 #if XNU_MONITOR
9826 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_asid_bitmap), PAGE_SIZE);
9827 #else
9828 kfree_data(new_nested_region_asid_bitmap,
9829 new_nested_region_asid_bitmap_size * sizeof(unsigned int));
9830 #endif
9831 }
9832 if (deref_subord) {
9833 #if XNU_MONITOR
9834 os_atomic_dec(&subord->nested_count, relaxed);
9835 #endif
9836 pmap_destroy_internal(subord);
9837 }
9838 return vrestart;
9839 }
9840
/**
 * Nest the address range [vstart, vstart + size) of 'subord' within 'grand'
 * by repeatedly invoking the re-entrant internal helper until the entire
 * range has been processed.
 *
 * @param grand  pmap into which the nested region is installed.
 * @param subord pmap supplying the shared mappings.
 * @param vstart base address of the region to nest.
 * @param size   size of the region to nest.
 *
 * @return KERN_SUCCESS on completion, or the first hard failure reported by
 *         the internal helper.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	/* The helper signals completion by returning (vend | PMAP_NEST_GRAND). */
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr;

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages; donate one and retry. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful call must advance the restart cursor. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	/* Nesting moves pages between pmaps; verify the ledgers still balance. */
	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9899
/*
 * kern_return_t pmap_unnest(grand, vaddr, size)
 *
 * grand = the pmap that will have the virtual range unnested
 * vaddr = start of range in pmap to be unnested
 * size = size of range in pmap to be unnested
 *
 */
9908
/**
 * Remove the nesting mappings for [vaddr, vaddr + size) from 'grand'.
 * Convenience wrapper around pmap_unnest_options() with no option flags.
 */
kern_return_t
pmap_unnest(
	pmap_t grand,
	addr64_t vaddr,
	uint64_t size)
{
	return pmap_unnest_options(grand, vaddr, size, 0);
}
9917
9918 /**
9919 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9920 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9921 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9922 * still have the region nested. The mappings in 'grand' will be left empty
9923 * with the assumption that they will be demand-filled by subsequent access faults.
9924 *
9925 * This function operates in 2 main phases:
9926 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9927 * them non-global.
9928 * 2. Clearing of the twig-level TTEs for the address range in grand.
9929 *
9930 * This function may return early due to pending AST_URGENT preemption; if so
9931 * it will indicate the need to be re-entered.
9932 *
9933 * @param grand pmap from which to unnest mappings
9934 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9935 * @param size twig-aligned size of the nested range
9936 * @param vrestart the page-aligned starting address of the current call. May contain
9937 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9938 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9939 * grand is being torn down and step 1) above is not needed.
9940 *
9941 * @return the virtual address at which to restart the operation, possibly including
9942 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9943 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9944 */
9945 MARK_AS_PMAP_TEXT vm_map_offset_t
9946 pmap_unnest_options_internal(
9947 pmap_t grand,
9948 addr64_t vaddr,
9949 uint64_t size,
9950 vm_map_offset_t vrestart,
9951 unsigned int option)
9952 {
9953 vm_map_offset_t start;
9954 vm_map_offset_t addr;
9955 tt_entry_t *tte_p;
9956 unsigned int current_index;
9957 unsigned int start_index;
9958 unsigned int max_index;
9959 unsigned int entry_count = 0;
9960
9961 addr64_t vend;
9962 addr64_t true_end;
9963 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9964 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9965 }
9966 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9967 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9968 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9969 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9970 }
9971
9972 validate_pmap_mutable(grand);
9973
9974 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9975
9976 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9977 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9978 (unsigned long long)vaddr, (unsigned long long)size);
9979 }
9980
9981 if (__improbable(grand->nested_pmap == NULL)) {
9982 panic("%s: %p has no nested pmap", __func__, grand);
9983 }
9984
9985 true_end = vend;
9986 if (true_end > grand->nested_pmap->nested_region_true_end) {
9987 true_end = grand->nested_pmap->nested_region_true_end;
9988 }
9989
9990 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
9991 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
9992 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
9993 }
9994
9995 if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
9996 return vrestart;
9997 }
9998
9999 start = vrestart;
10000 if (start < grand->nested_pmap->nested_region_true_start) {
10001 start = grand->nested_pmap->nested_region_true_start;
10002 }
10003 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10004 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10005 bool flush_tlb = false;
10006
10007 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10008 pt_entry_t *bpte, *cpte;
10009
10010 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10011
10012 bpte = pmap_pte(grand->nested_pmap, addr);
10013
10014 /*
10015 * If we've re-entered this function partway through unnesting a leaf region, the
10016 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10017 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
10018 * address.
10019 */
10020 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap) ||
10021 (addr & pt_attr_twig_offmask(pt_attr))) {
10022 /*
10023 * Mark the 'twig' region as being unnested. Every mapping entered within
10024 * the nested pmap in this region will now be marked non-global. Do this
10025 * before marking any of the PTEs within the region as non-global to avoid
10026 * the possibility of pmap_enter() subsequently inserting a global mapping
10027 * in the region, which could lead to a TLB conflict if a non-global entry
10028 * is later inserted for the same VA in a pmap which has fully unnested this
10029 * region.
10030 */
10031 setbit(current_index, (int *)grand->nested_pmap->nested_region_asid_bitmap);
10032 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10033 pmap_paddr_t pa;
10034 unsigned int pai = 0;
10035 boolean_t managed = FALSE;
10036 pt_entry_t spte;
10037
10038 if ((*cpte != ARM_PTE_TYPE_FAULT)
10039 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10040 spte = *((volatile pt_entry_t*)cpte);
10041 while (!managed) {
10042 pa = pte_to_pa(spte);
10043 if (!pa_valid(pa)) {
10044 break;
10045 }
10046 pai = pa_index(pa);
10047 pvh_lock(pai);
10048 spte = *((volatile pt_entry_t*)cpte);
10049 pa = pte_to_pa(spte);
10050 if (pai == pa_index(pa)) {
10051 managed = TRUE;
10052 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10053 }
10054 pvh_unlock(pai);
10055 }
10056
10057 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10058 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10059 flush_tlb = true;
10060 }
10061
10062 if (managed) {
10063 pvh_assert_locked(pai);
10064 pvh_unlock(pai);
10065 }
10066 }
10067
10068 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10069 vrestart = addr;
10070 ++entry_count;
10071 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10072 pmap_pending_preemption())) {
10073 goto unnest_subord_done;
10074 }
10075 }
10076 }
10077 addr = vlim;
10078 vrestart = addr;
10079 ++entry_count;
10080 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10081 pmap_pending_preemption())) {
10082 break;
10083 }
10084 }
10085
10086 unnest_subord_done:
10087 if (flush_tlb) {
10088 FLUSH_PTE_STRONG();
10089 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10090 }
10091
10092 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10093 if (current_index < max_index) {
10094 return vrestart;
10095 }
10096 }
10097
10098 /*
10099 * invalidate all pdes for segment at vaddr in pmap grand
10100 */
10101 if (vrestart & PMAP_NEST_GRAND) {
10102 addr = vrestart & ~PMAP_NEST_GRAND;
10103 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10104 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10105 }
10106 } else {
10107 addr = vaddr;
10108 vrestart = vaddr | PMAP_NEST_GRAND;
10109 }
10110
10111 /**
10112 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10113 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10114 * upon reentry.
10115 */
10116 if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10117 return vrestart;
10118 }
10119
10120 if (addr < grand->nested_pmap->nested_region_true_start) {
10121 addr = grand->nested_pmap->nested_region_true_start;
10122 }
10123
10124 while (addr < true_end) {
10125 tte_p = pmap_tte(grand, addr);
10126 /*
10127 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10128 * so it's possible that a region we're trying to unnest may not have been
10129 * nested in the first place.
10130 */
10131 if (tte_p != NULL) {
10132 *tte_p = ARM_TTE_TYPE_FAULT;
10133 }
10134 addr += pt_attr_twig_size(pt_attr);
10135 vrestart = addr | PMAP_NEST_GRAND;
10136 ++entry_count;
10137 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10138 pmap_pending_preemption())) {
10139 break;
10140 }
10141 }
10142 if (addr >= true_end) {
10143 vrestart = vend | PMAP_NEST_GRAND;
10144 }
10145
10146 FLUSH_PTE_STRONG();
10147 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10148
10149 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10150
10151 return vrestart;
10152 }
10153
10154 kern_return_t
10155 pmap_unnest_options(
10156 pmap_t grand,
10157 addr64_t vaddr,
10158 uint64_t size,
10159 unsigned int option)
10160 {
10161 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10162 vm_map_offset_t vend = vaddr + size;
10163
10164 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10165 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10166
10167 pmap_verify_preemptible();
10168 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10169 #if XNU_MONITOR
10170 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10171 #else
10172 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10173 #endif
10174 }
10175
10176 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10177
10178 return KERN_SUCCESS;
10179 }
10180
10181 boolean_t
10182 pmap_adjust_unnest_parameters(
10183 __unused pmap_t p,
10184 __unused vm_map_offset_t *s,
10185 __unused vm_map_offset_t *e)
10186 {
10187 return TRUE; /* to get to log_unnest_badness()... */
10188 }
10189
10190 #if PMAP_FORK_NEST
10191 /**
10192 * Perform any necessary pre-nesting of the parent's shared region at fork()
10193 * time.
10194 *
10195 * @note This should only be called from vm_map_fork().
10196 *
10197 * @param old_pmap The pmap of the parent task.
10198 * @param new_pmap The pmap of the child task.
10199 * @param nesting_start An output parameter that is updated with the start
10200 * address of the range that was pre-nested
10201 * @param nesting_end An output parameter that is updated with the end
10202 * address of the range that was pre-nested
10203 *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed.
10205 * KERN_INVALID_ARGUMENT if the arguments were not valid.
10206 */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Parent has no shared region nested; nothing to pre-nest in the child. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}
	/*
	 * Nest the parent's shared region into the child at the same range.
	 * NOTE(review): pmap_nest()'s return value is ignored here; the assertf
	 * below catches inconsistent state, but a clean failure path may be
	 * intended -- confirm.
	 */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* The child must now mirror the parent's nesting state exactly. */
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* Report the pre-nested range back to the caller (vm_map_fork()). */
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
10238 #endif /* PMAP_FORK_NEST */
10239
10240 /*
10241 * disable no-execute capability on
10242 * the specified pmap
10243 */
#if DEVELOPMENT || DEBUG
void
pmap_disable_NX(
	pmap_t pmap)
{
	/* Allow mappings in this pmap to be executable (debug builds only). */
	pmap->nx_enabled = FALSE;
}
#else
/* On release builds NX cannot be disabled; this is a deliberate no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10258
10259 /*
10260 * flush a range of hardware TLB entries.
10261 * NOTE: assumes the smallest TLB entry in use will be for
10262 * an ARM small page (4K).
10263 */
10264
10265 #if __ARM_RANGE_TLBI__
10266 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10267 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10268 #else
10269 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10270 #endif // __ARM_RANGE_TLBI__
10271
/*
 * Issue (but do not synchronize) TLB invalidation for [va, va + length) in
 * 'pmap', picking a maintenance strategy by the number of pages covered:
 *   - very large ranges: flush the whole ASID, or the entire TLB for ASID 0
 *     and nested pmaps,
 *   - medium ranges (with range-TLBI hardware): one ranged invalidate,
 *   - small ranges: per-entry invalidates.
 * Callers are responsible for the subsequent barriers/sync_tlb_flush().
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	/* Translation granule of this pmap's page tables (e.g. 4K or 16K). */
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	ppnum_t npages = (ppnum_t)(length >> pmap_page_shift);
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/*
		 * ASID 0 and nested pmaps get a full TLB flush -- presumably their
		 * entries cannot all be reached via a single-ASID invalidate
		 * (nested mappings are shared across address spaces); confirm.
		 */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/* Encode page count, ASID and base VA into the ranged-TLBI operand. */
		va = generate_rtlbi_param(npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Per-entry invalidation: fold the ASID into the TLBI operands. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10320
/* Asynchronously invalidate all TLB entries tagged with this pmap's hardware ASID. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10326
/*
 * Synchronously flush kernel TLB entries for [va, va + length): issue the
 * invalidates, then wait for them to complete.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10335
10336 unsigned int
10337 pmap_cache_attributes(
10338 ppnum_t pn)
10339 {
10340 pmap_paddr_t paddr;
10341 unsigned int pai;
10342 unsigned int result;
10343 pp_attr_t pp_attr_current;
10344
10345 paddr = ptoa(pn);
10346
10347 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10348
10349 if (!pa_valid(paddr)) {
10350 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10351 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10352 }
10353
10354 result = VM_WIMG_DEFAULT;
10355
10356 pai = pa_index(paddr);
10357
10358 pp_attr_current = pp_attr_table[pai];
10359 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10360 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10361 }
10362 return result;
10363 }
10364
/*
 * Perform the cache maintenance required when the WIMG (cacheability)
 * attribute of page 'pn' changes from 'wimg_bits_prev' to 'wimg_bits_new',
 * so stale lines cached under the old attribute cannot surface under the new.
 */
MARK_AS_PMAP_TEXT static void
pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
{
	/*
	 * Synchronize the page's cache lines when leaving a cacheable attribute.
	 * NOTE(review): the final clause '(new != COPYBACK) || (new != INNERWBACK)'
	 * is a tautology (no value equals both constants); '&&' may have been
	 * intended -- confirm intent before changing, as the current form simply
	 * syncs on every change away from WTHRU.
	 */
	if ((wimg_bits_prev != wimg_bits_new)
	    && ((wimg_bits_prev == VM_WIMG_COPYBACK)
	    || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
	    && (wimg_bits_new != VM_WIMG_COPYBACK))
	    || ((wimg_bits_prev == VM_WIMG_WTHRU)
	    && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
		pmap_sync_page_attributes_phys(pn);
	}

	/* Transitioning into real-time (RT) memory: force a data-cache clean. */
	if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
		pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
	}
}
10381
/*
 * Change the cache attributes of compressor page 'pn' from 'prev_cacheattr'
 * to 'new_cacheattr', updating its mappings under the PVH lock and then
 * performing any cache maintenance the transition requires.  The page must
 * be managed (pa_valid) and, under the PPL, must not be PPL-owned.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* Rewrite the page's mapping attributes while holding the PVH lock. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Flush/clean caches as required by the attribute transition. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10406
/*
 * Return a kernel virtual address through which the compressor may access
 * page 'pn'.  When the physical aperture is PTE-mapped, first switch any
 * non-default cache attribute to VM_WIMG_DEFAULT so aperture accesses use
 * the default (cacheable) attribute.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10422
/*
 * Finish compressor access to page 'pn'.  If the page's recorded cache
 * attribute differs from the default, restore its mappings from
 * VM_WIMG_DEFAULT back to that attribute (inverse of pmap_map_compressor_page).
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10437
10438 /**
10439 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10440 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10441 *
10442 * @param user_page_list List of pages to be updated.
10443 * @param page_cnt Number of pages in total in user_page_list.
10444 * @param cacheattr The new cache attribute.
10445 *
10446 * @return Success if true is returned.
10447 */
bool
pmap_batch_set_cache_attributes(
	upl_page_info_array_t user_page_list,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);

	/* NOTE(review): this early return skips the DBG_FUNC_END event emitted
	 * below, leaving the trace interval unbalanced -- confirm intent. */
	if (page_cnt == 0) {
		return true;
	}

	/* State-machine cursor: the helper may return early (e.g. to honor
	 * pending preemption under the PPL) and is re-invoked until DONE. */
	batch_set_cache_attr_state_t states;
	states.page_index = 0;
	states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
	states.tlb_flush_pass_needed = false;
	states.rt_cache_flush_pass_needed = false;

	/* Verify we are being called from a preemptible context. */
	pmap_verify_preemptible();

	while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
#if XNU_MONITOR
		states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
#else /* !XNU_MONITOR */
		states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
#endif /* XNU_MONITOR */
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
	return true;
}
10480
10481 /**
10482 * Flushes TLB entries associated with the page specified by paddr, but do not
10483 * issue barriers yet.
10484 *
10485 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10486 */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	/* The caller must already hold the PVH lock for this page. */
	pvh_assert_locked(pai);

	/* The PV head holds either a single PTE pointer or a list of PV entries. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page, queuing a TLB invalidate for each. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry; advance to the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU TLB entries; nothing to invalidate. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Each PV entry carries PTE_PER_PVE slots; step through them before
		 * moving to the next entry in the list. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10540
10541 /**
10542 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10543 *
10544 * @param pai The Physical Address Index of the entry.
10545 * @param cacheattr The new cache attribute.
10546 */
10547 MARK_AS_PMAP_TEXT static void
10548 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10549 {
10550 pvh_assert_locked(pai);
10551
10552 pp_attr_t pp_attr_current, pp_attr_template;
10553 do {
10554 pp_attr_current = pp_attr_table[pai];
10555 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10556
10557 /**
10558 * WIMG bits should only be updated under the PVH lock, but we should do
10559 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10560 */
10561 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10562 }
10563
10564 /**
10565 * Batch updates the cache attributes of a list of pages in three passes.
10566 *
10567 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10568 * In pass two, TLB entries are flushed for each page in the list if necessary.
10569 * In pass three, caches are cleaned for each page in the list if necessary.
10570 *
10571 * When running in PPL, this function may decide to return to the caller in response
10572 * to AST_URGENT.
10573 *
10574 * @param user_page_list List of pages to be updated.
10575 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10576 * @param page_cnt Number of pages in total in user_page_list.
10577 * @param cacheattr The new cache attributes.
10578 *
10579 * @return The new state of the state machine.
10580 */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the caller-supplied state-machine fields into locals. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	/*
	 * Pass 1: update the pp_attr_table and PTEs for each page, recording
	 * whether later TLB-flush and/or RT cache-flush passes will be needed.
	 */
	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* An all-zero WIMG field means the default attribute. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Transitioning into VM_WIMG_RT requires a cache clean pass. */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Flushes are queued asynchronously; arm64_sync_tlb() below completes them. */
			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* On early exit, DC-by-VA ops must be disabled before leaving the PPL. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the locals into the state struct returned to the caller. */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10817
/**
 * Sets the cache attributes of a single managed page: updates the
 * pp_attr_table WIMG bits under the PVH lock, rewrites the mappings if the
 * effective WIMG value changed, and performs any required cache/TLB
 * maintenance via pmap_sync_wimg().
 *
 * @param pn The page number of the page to update.
 * @param cacheattr The new cache attributes (VM_WIMG_*).
 * @param external On PPL systems, whether the request targets a non-PPL
 *                 (kernel-managed) page; used only for ownership checks.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return; /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	/* The caller's claimed ownership (external vs. PPL) must match the page's. */
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* An all-zero WIMG field means the default attribute. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite the mappings (with immediate TLBI) if the WIMG value changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Perform any cache maintenance required by the attribute transition. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10880
10881 MARK_AS_PMAP_TEXT void
10882 pmap_set_cache_attributes_internal(
10883 ppnum_t pn,
10884 unsigned int cacheattr)
10885 {
10886 pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10887 }
10888
/**
 * Public entry point for setting the cache attributes of a page. On PPL
 * systems this traps into the PPL; otherwise it calls the internal
 * implementation directly.
 *
 * @param pn The page number of the page to update.
 * @param cacheattr The new cache attributes.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10900
10901 /**
10902 * Updates the page numbered ppnum to have attribute specified by attributes.
10903 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10904 * The necessity of the TLB flush is returned in case this function is called
10905 * in a batched manner and the TLB flush is intended to be done at a different
10906 * timing.
10907 *
10908 * @param ppnum Page Number of the page to be updated.
10909 * @param attributes The new cache attributes.
10910 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10911 * immediately.
10912 *
10913 * @return Returns true if a TLB flush is needed for this update regardless of
10914 * whether a flush has occurred already.
10915 */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	pv_entry_t **pv_h;
	pt_entry_t tmplate;
	unsigned int pai;
	boolean_t tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	/* Optionally reject device-type memory attributes on managed (DRAM) pages. */
	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO: // nGnRnE
		case VM_WIMG_POSTED: // nGnRE
			/* supported on DRAM, but slow, so we disallow */
			/* FALLTHROUGH: treated the same as the unsupported types below. */

		case VM_WIMG_POSTED_REORDERED: // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* Update the physical aperture (kernel static) mapping of the page first. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the existing XPRR permission bits on PPL systems. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Deferred mode: write the PTE now; the caller flushes the TLB later. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/*
	 * The PV head is either a single PTE pointer (one mapping) or a list of
	 * PV entries (multiple mappings); set up the walk accordingly.
	 */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Rewrite the attribute bits of every mapping of the page. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty PTE slot within this PVE; advance to the next slot. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not CPU PTEs; leave them untouched. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			/* Deferred mode: write the PTE now; the caller flushes the TLB later. */
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	/* Complete the async flushes queued above (only in immediate-TLBI mode). */
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	/* True if any PTE was rewritten, even if this call already performed the flush. */
	return tlb_flush_needed;
}
11057
11058 /**
11059 * Mark a pmap as being dedicated to use for a commpage mapping.
11060 * The pmap itself will never be activated on a CPU; its mappings will
11061 * only be embedded in userspace pmaps at a fixed virtual address.
11062 *
11063 * @param pmap the pmap to mark as belonging to a commpage.
11064 */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be configured before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a plain user pmap may be converted into a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID. This pmap should not ever be directly
	 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap. Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11084
11085 static void
11086 pmap_update_tt3e(
11087 pmap_t pmap,
11088 vm_address_t address,
11089 tt_entry_t template)
11090 {
11091 tt_entry_t *ptep, pte;
11092
11093 ptep = pmap_tt3e(pmap, address);
11094 if (ptep == NULL) {
11095 panic("%s: no ptep?", __FUNCTION__);
11096 }
11097
11098 pte = *ptep;
11099 pte = tte_to_pa(pte) | template;
11100 write_pte_strong(ptep, pte);
11101 }
11102
/*
 * PTE template for commpage data mappings: read-only, non-executable.
 * Note absence of non-global bit — commpage translations are shared across
 * all address spaces rather than tagged with a single ASID.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
	        | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping: read-only but user-executable.
 * Note absence of non-global bit and (user) no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
	        | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
	        | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
	        | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11114
/**
 * Allocates the commpage data/RO-data/text pages, builds the dedicated
 * commpage pmap(s) that map them at the fixed user addresses, and returns the
 * kernel virtual addresses used to populate them.
 *
 * @param kernel_data_addr Out: KVA of the writable commpage data page.
 * @param kernel_text_addr Out: KVA of the commpage text page (0 if no PFZ).
 * @param kernel_ro_data_addr Out: KVA of the kernel RO data commpage.
 * @param user_text_addr Out: randomized user VA of the text commpage
 *                       (0 when CONFIG_ARM_PFZ is disabled).
 */
void
pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
    vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
{
	kern_return_t kr;
	pmap_paddr_t data_pa = 0; // data address
	pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
	pmap_paddr_t text_pa = 0; // text address

	*kernel_data_addr = 0;
	*kernel_text_addr = 0;
	*user_text_addr = 0;

#if XNU_MONITOR
	/* On PPL systems, allocate and zero each page from the PPL page allocator. */
	data_pa = pmap_alloc_page_for_kern(0);
	assert(data_pa);
	memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
	ro_data_pa = pmap_alloc_page_for_kern(0);
	assert(ro_data_pa);
	memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
#if CONFIG_ARM_PFZ
	text_pa = pmap_alloc_page_for_kern(0);
	assert(text_pa);
	memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
#endif

#else /* XNU_MONITOR */
	(void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
	/*
	 * For non-PPL devices, we have neither page lockdown nor a physical aperture
	 * mapped at page granularity, so a separate page for kernel RO data would not
	 * be useful.
	 */
	ro_data_pa = data_pa;
#if CONFIG_ARM_PFZ
	(void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
#endif

#endif /* XNU_MONITOR */

	/*
	 * In order to avoid burning extra pages on mapping the shared page, we
	 * create a dedicated pmap for the shared page. We forcibly nest the
	 * translation tables from this pmap into other pmaps. The level we
	 * will nest at depends on the MMU configuration (page size, TTBR range,
	 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
	 *
	 * Note that this is NOT "the nested pmap" (which is used to nest the
	 * shared cache).
	 *
	 * Note that we update parameters of the entry for our unique needs (NG
	 * entry, etc.).
	 */
	commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
	assert(commpage_pmap_default != NULL);
	pmap_set_commpage(commpage_pmap_default);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	/* Rewrite the PTE with the commpage template (global, read-only). */
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if CONFIG_ARM_PFZ
	/* User mapping of comm page text section for 64 bit mapping only
	 *
	 * We don't insert it into the 32 bit mapping because we don't want 32 bit
	 * user processes to get this page mapped in, they should never call into
	 * this page.
	 *
	 * The data comm page is in a pre-reserved L3 VA range and the text commpage
	 * is slid in the same L3 as the data commpage. It is either outside the
	 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
	 * it is reserved and unavailable to mach VM for future mappings.
	 */
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
	int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;

	vm_map_address_t commpage_text_va = 0;

	do {
		/* Randomize the leaf index so the text commpage VA is slid. */
		int text_leaf_index = random() % num_ptes;

		// Generate a VA for the commpage text with the same root and twig index as data
		// comm page, but with new leaf index we've just generated.
		commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
		commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
	} while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)

	// Assert that this is empty
	__assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_TTE_EMPTY);

	// At this point, we've found the address we want to insert our comm page at
	kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	// Mark it as global page R/X so that it doesn't get thrown out on tlb flush
	pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);

	*user_text_addr = commpage_text_va;
#endif

	/* ...and the user 32-bit mappings. */
	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#if __ARM_MIXED_PAGE_SIZE__
	/**
	 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
	 * new set of page tables that point to the exact same 16K shared page as
	 * before. Only the first 4K of the 16K shared page is mapped since that's
	 * the only part that contains relevant data.
	 */
	commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
	assert(commpage_pmap_4k != NULL);
	pmap_set_commpage(commpage_pmap_4k);

	/* The user 64-bit mappings... */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	/* ...and the user 32-bit mapping. */
	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);

	kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
	assert(kr == KERN_SUCCESS);
	pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
#endif

	/* For manipulation in kernel, go straight to physical page */
	*kernel_data_addr = phystokv(data_pa);
	assert(commpage_ro_data_kva == 0);
	*kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
	assert(commpage_text_kva == 0);
	*kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
}
11265
11266
11267 /*
11268 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11269 * with user controlled TTEs for regions that aren't explicitly reserved by the
11270 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11271 */
11272 #if (ARM_PGSHIFT == 14)
11273 /**
11274 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11275 * commpage completely above the maximum 32-bit userspace VA.
11276 */
11277 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11278
11279 /**
11280 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11281 * userspace VAs can nest the commpage completely above the maximum 64-bit
11282 * userpace VA, but that technically isn't true on macOS. On those systems, the
11283 * commpage lives within the userspace VA range, but is protected by the VM as
11284 * a reserved region (see vm_reserved_regions[] definition for more info).
11285 */
11286
11287 #elif (ARM_PGSHIFT == 12)
11288 /**
11289 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11290 * above the maximum userspace VA.
11291 */
11292 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11293 #else
11294 #error Nested shared page mapping is unsupported on this config
11295 #endif
11296
/**
 * Inserts the commpage mappings into the given user pmap by copying a
 * twig-level TTE from the appropriate commpage pmap, expanding the user
 * pmap's page tables first if necessary.
 *
 * @param pmap The user pmap to receive the commpage mapping.
 *
 * @return KERN_SUCCESS on success; KERN_ABORTED (or, on PPL systems,
 *         KERN_RESOURCE_SHORTAGE) if pmap_expand() fails in a recoverable way.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* Inside the PPL we cannot wait for memory; fail and let the caller retry. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 *
	 * The pmap lock must be dropped around pmap_expand(), so re-check the
	 * entry after reacquiring it (hence the loop).
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				/* PMAP_OPTIONS_NOWAIT allocation failed; caller may retry outside the PPL. */
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Copy the pre-built commpage twig entry into the user pmap. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11399
/*
 * Unmap the commpage from a task's pmap by clearing the twig-level table
 * entry that points at the shared commpage page tables, then flushing the
 * TLB for that VA range.  Inverse of the commpage insertion path.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage lives at a different fixed VA in 32-bit and 64-bit tasks. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* No twig entry at this level: the commpage was never inserted. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	/* Clear the entry, force it visible to the walker, then flush stale TLB entries. */
	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11456
/*
 * Map the commpage into a task's pmap, retrying until it sticks.
 *
 * On PPL (XNU_MONITOR) systems the insertion runs inside the PPL; a
 * KERN_RESOURCE_SHORTAGE reply means the PPL is out of free pages, so we
 * donate one with pmap_alloc_page_for_ppl() before retrying.  KERN_ABORTED
 * is treated as retryable in both variants.  Any other failure is fatal.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11485
11486 static boolean_t
11487 pmap_is_64bit(
11488 pmap_t pmap)
11489 {
11490 return pmap->is_64bit;
11491 }
11492
/*
 * Report whether the pmap uses an "exotic" (non-standard) address-space
 * configuration.  Always false in this implementation.
 */
bool
pmap_is_exotic(
	pmap_t pmap __unused)
{
	return false;
}
11499
11500
11501 /* ARMTODO -- an implementation that accounts for
11502 * holes in the physical map, if any.
11503 */
/*
 * Return TRUE if the physical page number refers to kernel-managed
 * (pa_valid) memory.
 */
boolean_t
pmap_valid_page(
	ppnum_t pn)
{
	return pa_valid(ptoa(pn));
}
11510
11511 boolean_t
11512 pmap_bootloader_page(
11513 ppnum_t pn)
11514 {
11515 pmap_paddr_t paddr = ptoa(pn);
11516
11517 if (pa_valid(paddr)) {
11518 return FALSE;
11519 }
11520 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11521 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11522 }
11523
/*
 * Scan [va_start, va_end) in pmap and return TRUE iff every PTE in the
 * range is empty.
 *
 * The pmap lock is taken only for user pmaps and only when not running in
 * the kernel debugger context (not_in_kdp), so the debugger can call this
 * without trying to acquire locks.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Sample once so the lock and unlock decisions below always agree. */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Visit one twig-sized (leaf page table) chunk per iteration. */
	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					/* Found a live mapping: unlock (if locked) and report non-empty. */
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11584
/*
 * Return TRUE iff no valid mappings exist in [va_start, va_end) of pmap.
 * Dispatches into the PPL on monitor-enabled systems.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11597
11598 vm_map_offset_t
11599 pmap_max_offset(
11600 boolean_t is64,
11601 unsigned int option)
11602 {
11603 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11604 }
11605
11606 vm_map_offset_t
11607 pmap_max_64bit_offset(
11608 __unused unsigned int option)
11609 {
11610 vm_map_offset_t max_offset_ret = 0;
11611
11612 #if defined(__arm64__)
11613 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11614 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11615 max_offset_ret = arm64_pmap_max_offset_default;
11616 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11617 max_offset_ret = min_max_offset;
11618 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11619 max_offset_ret = MACH_VM_MAX_ADDRESS;
11620 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11621 if (arm64_pmap_max_offset_default) {
11622 max_offset_ret = arm64_pmap_max_offset_default;
11623 } else if (max_mem > 0xC0000000) {
11624 // devices with > 3GB of memory
11625 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11626 } else if (max_mem > 0x40000000) {
11627 // devices with > 1GB and <= 3GB of memory
11628 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11629 } else {
11630 // devices with <= 1 GB of memory
11631 max_offset_ret = min_max_offset;
11632 }
11633 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11634 if (arm64_pmap_max_offset_default) {
11635 // Allow the boot-arg to override jumbo size
11636 max_offset_ret = arm64_pmap_max_offset_default;
11637 } else {
11638 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11639 }
11640 } else {
11641 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11642 }
11643
11644 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11645 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11646 assert(max_offset_ret >= min_max_offset);
11647 }
11648 #else
11649 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11650 #endif
11651
11652 return max_offset_ret;
11653 }
11654
11655 vm_map_offset_t
11656 pmap_max_32bit_offset(
11657 unsigned int option)
11658 {
11659 vm_map_offset_t max_offset_ret = 0;
11660
11661 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11662 max_offset_ret = arm_pmap_max_offset_default;
11663 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11664 max_offset_ret = VM_MAX_ADDRESS;
11665 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11666 max_offset_ret = VM_MAX_ADDRESS;
11667 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11668 if (arm_pmap_max_offset_default) {
11669 max_offset_ret = arm_pmap_max_offset_default;
11670 } else if (max_mem > 0x20000000) {
11671 max_offset_ret = VM_MAX_ADDRESS;
11672 } else {
11673 max_offset_ret = VM_MAX_ADDRESS;
11674 }
11675 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11676 max_offset_ret = VM_MAX_ADDRESS;
11677 } else {
11678 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11679 }
11680
11681 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11682 return max_offset_ret;
11683 }
11684
11685 #if CONFIG_DTRACE
11686 /*
11687 * Constrain DTrace copyin/copyout actions
11688 */
11689 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11690 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11691
11692 kern_return_t
11693 dtrace_copyio_preflight(
11694 __unused addr64_t va)
11695 {
11696 if (current_map() == kernel_map) {
11697 return KERN_FAILURE;
11698 } else {
11699 return KERN_SUCCESS;
11700 }
11701 }
11702
/* No post-copyio cleanup is required on this architecture. */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11709 #endif /* CONFIG_DTRACE */
11710
11711
/* Deferred-flush contexts are not used by this pmap; nothing to initialize. */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11716
11717
11718 void
11719 pmap_flush(
11720 __unused pmap_flush_context *cpus_to_flush)
11721 {
11722 /* not implemented yet */
11723 return;
11724 }
11725
11726 #if XNU_MONITOR
11727
11728 /*
11729 * Enforce that the address range described by kva and nbytes is not currently
11730 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11731 * unintentionally writing to PPL-owned memory.
11732 */
void
pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	/* Walk page-by-page; round_page(ckva + 1) advances even from an unaligned start. */
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);
		pp_attr_t attr;
		unsigned int pai = pa_index(pa);
		/* Only dynamic mappings may be pinned, never the static physical aperture. */
		if (ckva == phystokv(pa)) {
			panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
		}
		/*
		 * CAS loop: atomically set PP_ATTR_NO_MONITOR so the page cannot
		 * become PPL-owned, re-checking on each retry that it has not
		 * already become PPL-owned (PP_ATTR_MONITOR) in the meantime.
		 */
		do {
			attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
			if (attr & PP_ATTR_MONITOR) {
				panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
			}
		} while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
	}
}
11755
/*
 * Release the pin taken by pmap_pin_kernel_pages() on the same range,
 * clearing PP_ATTR_NO_MONITOR on every page it covers.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		/* Unpinning a page that was never pinned means unbalanced pin/unpin calls. */
		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A page can never be both pinned (NO_MONITOR) and PPL-owned (MONITOR). */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11773
11774 /**
11775 * Lock down a page, making all mappings read-only, and preventing further
11776 * mappings or removal of this particular kva's mapping. Effectively, it makes
11777 * the physical page at kva immutable (see the ppl_writable parameter for an
11778 * exception to this).
11779 *
11780 * @param kva Valid address to any mapping of the physical page to lockdown.
11781 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11782 * @param ppl_writable True if the PPL should still be able to write to the page
11783 * using the physical aperture mapping. False will make the
11784 * page read-only for both the kernel and PPL in the
11785 * physical aperture.
11786 */
11787
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	/* Default lockdown caps all alias mappings at read-only. */
	pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
}
11793
11794 /**
11795 * Lock down a page, giving all mappings the specified maximum permissions, and
11796 * preventing further mappings or removal of this particular kva's mapping.
11797 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11798 * parameter for an exception to this).
11799 *
11800 * @param kva Valid address to any mapping of the physical page to lockdown.
11801 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11802 * @param ppl_writable True if the PPL should still be able to write to the page
11803 * using the physical aperture mapping. False will make the
11804 * page read-only for both the kernel and PPL in the
11805 * physical aperture.
11806 * @param prot Maximum permissions to allow in existing alias mappings
11807 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* Caller must pass exactly a recognized lockdown-reason bit. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* PPL-owned pages cannot additionally be locked down from the kernel. */
	if (__improbable(ppattr_pa_test_monitor(pa))) {
		panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
	}

	/* Double-lockdown and lockdown of executable pages are both forbidden. */
	if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
		panic("%s: %#lx already locked down/executable (%#llx)",
		    __func__, kva, (uint64_t)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags | lockdown_flag);

	/* Update the physical aperture mapping to prevent kernel write access. */
	const unsigned int new_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);

	pvh_unlock(pai);

	/* Demote every existing alias mapping of the page to at most `prot`. */
	pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);

	/**
	 * Double-check that the mapping didn't change physical addresses before the
	 * LOCKDOWN flag was set (there is a brief window between the above
	 * kvtophys() and pvh_lock() calls where the mapping could have changed).
	 *
	 * This doesn't solve the ABA problem, but this doesn't have to since once
	 * the pvh_lock() is grabbed no new mappings can be created on this physical
	 * page without the LOCKDOWN flag already set (so any future mappings can
	 * only be RO, and no existing mappings can be removed).
	 */
	if (kvtophys_nofail(kva) != pa) {
		panic("%s: Physical address of mapping changed while setting LOCKDOWN "
		    "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
	}
}
11855
11856 /**
11857 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11858 * kernel once again.
11859 *
11860 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11861 * to unlockdown a page that was never locked down, will panic.
11862 *
11863 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11864 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11865 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11866 * passed to the paired pmap_ppl_lockdown_page() call. Any
11867 * deviation will result in a panic.
11868 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
{
	pvh_assert_locked(pai);
	pv_entry_t **pvh = pai_to_pvh(pai);
	const vm_offset_t pvh_flags = pvh_get_flags(pvh);

	/* Unbalanced lockdown/unlockdown (or mismatched reason flag) is fatal. */
	if (__improbable(!(pvh_flags & lockdown_flag))) {
		panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
		    __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
	}


	pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);

	/* Restore the pre-lockdown physical aperture mapping permissions. */
	const unsigned int old_xprr_perm =
	    (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
	pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
}
11889
11890 /**
11891 * Release a page from being locked down to the PPL, making it writable to the
11892 * kernel once again.
11893 *
11894 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11895 * to unlockdown a page that was never locked down, will panic.
11896 *
11897 * @param kva Valid address to any mapping of the physical page to unlockdown.
11898 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11899 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11900 * passed to the paired pmap_ppl_lockdown_page() call. Any
11901 * deviation will result in a panic.
11902 */
MARK_AS_PMAP_TEXT static void
pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
{
	const pmap_paddr_t pa = kvtophys_nofail(kva);
	const unsigned int pai = pa_index(pa);

	/* Take the PVH lock, then defer to the _locked variant for the real work. */
	assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
	pvh_lock(pai);
	pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
	pvh_unlock(pai);
}
11914
11915 #else /* XNU_MONITOR */
11916
/* Non-PPL builds: no PPL ownership to guard against, so pinning is a no-op. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11921
/* Non-PPL builds: nothing was pinned, so unpinning is a no-op. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
11926
11927 #endif /* !XNU_MONITOR */
11928
11929
/*
 * Lock down a range of pages on behalf of code signing (pmap_cs).  On
 * non-PPL builds there is no lockdown-reason flag to record, so 0 is passed.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11939
/*
 * Undo pmap_cs_lockdown_pages() on the same range; the reason flag must
 * mirror the one used at lockdown time (PVH_FLAG_LOCKDOWN_CS, or 0 on
 * non-PPL builds).
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
11949
11950 /**
11951 * Perform basic validation checks on the destination only and
11952 * corresponding offset/sizes prior to writing to a read only allocation.
11953 *
11954 * @note Should be called before writing to an allocation from the read
11955 * only allocator.
11956 *
11957 * @param zid The ID of the zone the allocation belongs to.
11958 * @param va VA of element being modified (destination).
11959 * @param offset Offset being written to, in the element.
11960 * @param new_data_size Size of modification.
11961 *
11962 */
11963
11964 MARK_AS_PMAP_TEXT static void
11965 pmap_ro_zone_validate_element_dst(
11966 zone_id_t zid,
11967 vm_offset_t va,
11968 vm_offset_t offset,
11969 vm_size_t new_data_size)
11970 {
11971 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11972 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11973 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11974 }
11975
11976 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
11977
11978 /* Check element is from correct zone and properly aligned */
11979 zone_require_ro(zid, elem_size, (void*)va);
11980
11981 if (__improbable(new_data_size > (elem_size - offset))) {
11982 panic("%s: New data size %lu too large for elem size %lu at addr %p",
11983 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
11984 }
11985 if (__improbable(offset >= elem_size)) {
11986 panic("%s: Offset %lu too large for elem size %lu at addr %p",
11987 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
11988 }
11989 }
11990
11991
11992 /**
11993 * Perform basic validation checks on the source, destination and
11994 * corresponding offset/sizes prior to writing to a read only allocation.
11995 *
11996 * @note Should be called before writing to an allocation from the read
11997 * only allocator.
11998 *
11999 * @param zid The ID of the zone the allocation belongs to.
12000 * @param va VA of element being modified (destination).
12001 * @param offset Offset being written to, in the element.
12002 * @param new_data Pointer to new data (source).
12003 * @param new_data_size Size of modification.
12004 *
12005 */
12006
12007 MARK_AS_PMAP_TEXT static void
12008 pmap_ro_zone_validate_element(
12009 zone_id_t zid,
12010 vm_offset_t va,
12011 vm_offset_t offset,
12012 const vm_offset_t new_data,
12013 vm_size_t new_data_size)
12014 {
12015 vm_offset_t sum = 0;
12016
12017 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12018 panic("%s: Integer addition overflow %p + %lu = %lu",
12019 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12020 }
12021
12022 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12023 }
12024
12025 /**
12026 * Ensure that physical page is locked down and pinned, before writing to it.
12027 *
12028 * @note Should be called before writing to an allocation from the read
12029 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12030 * ensure that it is called after the modification.
12031 *
12032 *
12033 * @param pa Physical address of the element being modified.
12034 * @param va Virtual address of element being modified.
12035 * @param size Size of the modification.
12036 *
12037 */
12038
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_lock_phy_page(
	const pmap_paddr_t pa,
	vm_offset_t va,
	vm_size_t size)
{
	const unsigned int pai = pa_index(pa);
	/* Held across the write; released by pmap_ro_zone_unlock_phy_page(). */
	pvh_lock(pai);

	/* Ensure that the physical page is locked down */
#if XNU_MONITOR
	pv_entry_t **pvh = pai_to_pvh(pai);
	if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
		panic("%s: Physical page not locked down %llx", __func__, pa);
	}
#endif /* XNU_MONITOR */

	/* Ensure page can't become PPL-owned memory before the memcpy occurs */
	pmap_pin_kernel_pages(va, size);
}
12059
12060 /**
12061 * Unlock and unpin physical page after writing to it.
12062 *
12063 * @note Should be called after writing to an allocation from the read
12064 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12065 * ensure that it has been called prior to the modification.
12066 *
12067 * @param pa Physical address of the element that was modified.
12068 * @param va Virtual address of element that was modified.
12069 * @param size Size of the modification.
12070 *
12071 */
12072
MARK_AS_PMAP_TEXT static void
pmap_ro_zone_unlock_phy_page(
	const pmap_paddr_t pa,
	vm_offset_t va,
	vm_size_t size)
{
	const unsigned int pai = pa_index(pa);
	/* Unpin first, then drop the PVH lock taken in pmap_ro_zone_lock_phy_page(). */
	pmap_unpin_kernel_pages(va, size);
	pvh_unlock(pai);
}
12083
12084 /**
12085 * Function to copy kauth_cred from new_data to kv.
12086 * Function defined in "kern_prot.c"
12087 *
12088 * @note Will be removed upon completion of
12089 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12090 *
12091 * @param kv Address to copy new data to.
12092 * @param new_data Pointer to new data.
12093 *
12094 */
12095
12096 extern void
12097 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12098
12099 /**
12100 * Zalloc-specific memcpy that writes through the physical aperture
12101 * and ensures the element being modified is from a read-only zone.
12102 *
12103 * @note Designed to work only with the zone allocator's read-only submap.
12104 *
12105 * @param zid The ID of the zone to allocate from.
12106 * @param va VA of element to be modified.
12107 * @param offset Offset from element.
12108 * @param new_data Pointer to new data.
12109 * @param new_data_size Size of modification.
12110 *
12111 */
12112
void
pmap_ro_zone_memcpy(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	const vm_offset_t new_data,
	vm_size_t new_data_size)
{
	/* On PPL systems the write must be performed at PPL privilege. */
#if XNU_MONITOR
	pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
#else /* XNU_MONITOR */
	pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
#endif /* XNU_MONITOR */
}
12127
12128 MARK_AS_PMAP_TEXT void
12129 pmap_ro_zone_memcpy_internal(
12130 zone_id_t zid,
12131 vm_offset_t va,
12132 vm_offset_t offset,
12133 const vm_offset_t new_data,
12134 vm_size_t new_data_size)
12135 {
12136 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12137
12138 if (!new_data || new_data_size == 0) {
12139 return;
12140 }
12141
12142 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12143 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12144 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12145 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12146 }
12147
12148 /**
12149 * Zalloc-specific function to atomically mutate fields of an element that
 * belongs to a read-only zone, via the physical aperture.
12151 *
12152 * @note Designed to work only with the zone allocator's read-only submap.
12153 *
12154 * @param zid The ID of the zone the element belongs to.
12155 * @param va VA of element to be modified.
12156 * @param offset Offset in element.
12157 * @param op Atomic operation to perform.
12158 * @param value Mutation value.
12159 *
12160 */
12161
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	/* On PPL systems the mutation must be performed at PPL privilege. */
#if XNU_MONITOR
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12176
/*
 * Apply an atomic mutation to an RO-zone element field through the
 * physical aperture, returning the resulting value.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Low 4 bits of the op encode the operand size in bytes (see zro_atomic_op_t). */
	vm_size_t value_size = op & 0xf;

	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12195
12196 /**
12197 * bzero for allocations from read only zones, that writes through the
12198 * physical aperture.
12199 *
12200 * @note This is called by the zfree path of all allocations from read
12201 * only zones.
12202 *
12203 * @param zid The ID of the zone the allocation belongs to.
12204 * @param va VA of element to be zeroed.
12205 * @param offset Offset in the element.
12206 * @param size Size of allocation.
12207 *
12208 */
12209
void
pmap_ro_zone_bzero(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	/* On PPL systems the zeroing must be performed at PPL privilege. */
#if XNU_MONITOR
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12223
/*
 * Zero `size` bytes of the RO-zone element at va + offset through the
 * physical aperture (used on the zfree path of read-only zones).
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* Source of 0 means only destination validation applies. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12237
12238 /**
12239 * Removes write access from the Physical Aperture.
12240 *
12241 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12242 * @note Designed to work only with the zone allocator's read-only submap.
12243 *
 * @param va VA of the page to remove write access from.
12245 *
12246 */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* Lock the page down; the PPL keeps write access via the physical aperture. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: demote every mapping of the page to read-only instead. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12256
12257 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12258
/*
 * Count resident and compressed bytes in [start, end) of pmap.  The range
 * must be page-aligned and fall within a single twig-level (leaf table)
 * region.  Returns the resident byte count, accumulates compressed bytes
 * into *compressed_bytes_p (note: "+="), or returns PMAP_RESIDENT_INVALID
 * if the pmap is NULL or has no TTE for the range.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		/* Classify each PTE: compressed marker vs. valid resident page. */
		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin so the out-parameter page cannot become PPL-owned mid-update. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12320
12321 mach_vm_size_t
12322 pmap_query_resident(
12323 pmap_t pmap,
12324 vm_map_address_t start,
12325 vm_map_address_t end,
12326 mach_vm_size_t *compressed_bytes_p)
12327 {
12328 mach_vm_size_t total_resident_bytes;
12329 mach_vm_size_t compressed_bytes;
12330 vm_map_address_t va;
12331
12332
12333 if (pmap == PMAP_NULL) {
12334 if (compressed_bytes_p) {
12335 *compressed_bytes_p = 0;
12336 }
12337 return 0;
12338 }
12339
12340 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12341
12342 total_resident_bytes = 0;
12343 compressed_bytes = 0;
12344
12345 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12346 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12347 VM_KERNEL_ADDRHIDE(end));
12348
12349 va = start;
12350 while (va < end) {
12351 vm_map_address_t l;
12352 mach_vm_size_t resident_bytes;
12353
12354 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12355
12356 if (l > end) {
12357 l = end;
12358 }
12359 #if XNU_MONITOR
12360 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12361 #else
12362 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12363 #endif
12364 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12365 break;
12366 }
12367
12368 total_resident_bytes += resident_bytes;
12369
12370 va = l;
12371 }
12372
12373 if (compressed_bytes_p) {
12374 *compressed_bytes_p = compressed_bytes;
12375 }
12376
12377 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12378 total_resident_bytes);
12379
12380 return total_resident_bytes;
12381 }
12382
12383 #if MACH_ASSERT
12384 static void
12385 pmap_check_ledgers(
12386 pmap_t pmap)
12387 {
12388 int pid;
12389 char *procname;
12390
12391 if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
12392 /*
12393 * This pmap was not or is no longer fully associated
12394 * with a task (e.g. the old pmap after a fork()/exec() or
12395 * spawn()). Its "ledger" still points at a task that is
12396 * now using a different (and active) address space, so
12397 * we can't check that all the pmap ledgers are balanced here.
12398 *
12399 * If the "pid" is set, that means that we went through
12400 * pmap_set_process() in task_terminate_internal(), so
12401 * this task's ledger should not have been re-used and
12402 * all the pmap ledgers should be back to 0.
12403 */
12404 return;
12405 }
12406
12407 pid = pmap->pmap_pid;
12408 procname = pmap->pmap_procname;
12409
12410 vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
12411 }
12412 #endif /* MACH_ASSERT */
12413
/*
 * Advise the pmap about the page-zero range. Deliberately a no-op in this
 * implementation.
 */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12418
12419 /**
12420 * The minimum shared region nesting size is used by the VM to determine when to
12421 * break up large mappings to nested regions. The smallest size that these
12422 * mappings can be broken into is determined by what page table level those
12423 * regions are being nested in at and the size of the page tables.
12424 *
12425 * For instance, if a nested region is nesting at L2 for a process utilizing
12426 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12427 * block entry).
12428 *
12429 * @param pmap The target pmap to determine the block size based on whether it's
12430 * using 16KB or 4KB page tables.
12431 */
12432 uint64_t
12433 pmap_shared_region_size_min(__unused pmap_t pmap)
12434 {
12435 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12436
12437 /**
12438 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12439 * 4KB pages). This means that a target pmap will contain L2 entries that
12440 * point to shared L3 page tables in the shared region pmap.
12441 */
12442 return pt_attr_twig_size(pt_attr);
12443 }
12444
12445 boolean_t
12446 pmap_enforces_execute_only(
12447 pmap_t pmap)
12448 {
12449 return pmap != kernel_pmap;
12450 }
12451
/*
 * PPL-side setter: record whether the VM map owning this pmap has
 * code-signing enforcement enabled. The pmap is validated (and must be
 * mutable) before it is written to.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12460
/*
 * Kernel-side entry point for recording the owning VM map's code-signing
 * enforcement flag on a pmap; dispatches into the PPL when one is present.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12472
12473 extern int cs_process_enforcement_enable;
12474 bool
12475 pmap_get_vm_map_cs_enforced(
12476 pmap_t pmap)
12477 {
12478 if (cs_process_enforcement_enable) {
12479 return true;
12480 }
12481 return pmap->pmap_vm_map_cs_enforced;
12482 }
12483
12484 MARK_AS_PMAP_TEXT void
12485 pmap_set_jit_entitled_internal(
12486 __unused pmap_t pmap)
12487 {
12488 return;
12489 }
12490
/*
 * Kernel-side entry point for marking a pmap JIT-entitled; dispatches into
 * the PPL when one is present (the non-PPL handler is a no-op here).
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12501
/*
 * Report whether this pmap is JIT-entitled; always false in this
 * implementation.
 */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12508
12509 MARK_AS_PMAP_TEXT void
12510 pmap_set_tpro_internal(
12511 __unused pmap_t pmap)
12512 {
12513 return;
12514 }
12515
/*
 * Kernel-side entry point for enabling TPRO on a pmap; dispatches into the
 * PPL when one is present (the non-PPL handler is a no-op here).
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12526
/*
 * Report whether TPRO is enabled for this pmap; always false in this
 * implementation.
 */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12533
12534 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12535
/*
 * Internal/PPL implementation of pmap_query_page_info(): compute the
 * disposition (present / compressed / internal / reusable / alt-accounted)
 * of the page mapped at 'va' in 'pmap' and store it through 'disp_p'.
 *
 * Returns KERN_INVALID_ARGUMENT (with *disp_p = 0) for a NULL or kernel
 * pmap; KERN_SUCCESS otherwise. 'disp_p' is pinned while written so the
 * kernel page backing it cannot be reclaimed mid-store.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE can change concurrently under a shared lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical page: check for a compressed-page marker. */
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		/* Pages outside managed memory carry no per-page attributes. */
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		/* Re-read under the PV head lock to confirm the snapshot. */
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Locate this mapping's PV entry to query per-mapping attributes. */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12614
/*
 * Return the disposition (PMAP_QUERY_PAGE_* flags) of the page mapped at
 * 'va' in 'pmap' through 'disp_p'; dispatches into the PPL when present.
 */
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12627
12628
12629
/*
 * Number of valid user virtual-address bits for 'pmap', derived from the
 * TCR T0SZ field (per-pmap when mixed page sizes are supported, otherwise
 * from the boot-time T0SZ).
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12640
/* Number of valid kernel virtual-address bits (from the boot-time T1SZ). */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12646
/* Size, in bytes, of the user virtual address space covered by 'pmap'. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12652
12653
12654
/*
 * Report whether the caller is executing inside the PPL; always false on
 * configurations without PPL support.
 */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12661
/*
 * I/O-filtered protected write; not supported on this platform, so any call
 * is fatal (never returns).
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12668
/*
 * Claim a reserved PPL page; unsupported here, so no page is ever returned.
 */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12675
/*
 * Return a previously claimed reserved PPL page; unsupported here, no-op.
 */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12681
12682
12683 #if PMAP_CS_PPL_MONITOR
12684
12685 /* Immutable part of the trust cache runtime */
12686 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12687
12688 /* Mutable part of the trust cache runtime */
12689 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12690
12691 /* Lock for the trust cache runtime */
12692 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12693
12694 MARK_AS_PMAP_TEXT kern_return_t
12695 pmap_check_trust_cache_runtime_for_uuid_internal(
12696 const uint8_t check_uuid[kUUIDSize])
12697 {
12698 kern_return_t ret = KERN_DENIED;
12699
12700 if (amfi->TrustCache.version < 3) {
12701 /* AMFI change hasn't landed in the build */
12702 pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12703 return KERN_NOT_SUPPORTED;
12704 }
12705
12706 /* Lock the runtime as shared */
12707 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12708
12709 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12710 &ppl_trust_cache_rt,
12711 check_uuid,
12712 NULL);
12713
12714 /* Unlock the runtime */
12715 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12716
12717 if (tc_ret.error == kTCReturnSuccess) {
12718 ret = KERN_SUCCESS;
12719 } else if (tc_ret.error == kTCReturnNotFound) {
12720 ret = KERN_NOT_FOUND;
12721 } else {
12722 ret = KERN_FAILURE;
12723 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12724 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12725 }
12726
12727 return ret;
12728 }
12729
/*
 * Kernel-side entry point: ask the PPL whether a trust cache with the given
 * UUID is loaded in its runtime.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12736
/*
 * PPL implementation for loading an image4-wrapped trust cache of 'type'
 * into the PPL trust cache runtime.
 *
 * The payload pages are locked down (transferred to PPL ownership, kept
 * PPL-writable for the embedded TrustCache_t bookkeeping) before use; on a
 * failed load they are unlocked again. The manifest is always unlocked
 * before returning. Returns KERN_SUCCESS on load, KERN_ALREADY_IN_SET for a
 * duplicate, KERN_RESOURCE_SHORTAGE when no spare PPL page is available
 * (callers donate a page and retry), and KERN_FAILURE otherwise.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* These trust cache types may only be loaded by other means. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12856
12857 kern_return_t
12858 pmap_load_trust_cache_with_type(
12859 TCType_t type,
12860 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12861 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12862 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12863 {
12864 kern_return_t ret = KERN_DENIED;
12865
12866 ret = pmap_load_trust_cache_with_type_ppl(
12867 type,
12868 pmap_img4_payload, pmap_img4_payload_len,
12869 img4_manifest, img4_manifest_len,
12870 img4_aux_manifest, img4_aux_manifest_len);
12871
12872 while (ret == KERN_RESOURCE_SHORTAGE) {
12873 /* Allocate a page from the free list */
12874 pmap_alloc_page_for_ppl(0);
12875
12876 /* Attempt the call again */
12877 ret = pmap_load_trust_cache_with_type_ppl(
12878 type,
12879 pmap_img4_payload, pmap_img4_payload_len,
12880 img4_manifest, img4_manifest_len,
12881 img4_aux_manifest, img4_aux_manifest_len);
12882 }
12883
12884 return ret;
12885 }
12886
12887 MARK_AS_PMAP_TEXT kern_return_t
12888 pmap_query_trust_cache_safe(
12889 TCQueryType_t query_type,
12890 const uint8_t cdhash[kTCEntryHashSize],
12891 TrustCacheQueryToken_t *query_token)
12892 {
12893 kern_return_t ret = KERN_NOT_FOUND;
12894
12895 /* Validate the query type preemptively */
12896 if (query_type >= kTCQueryTypeTotal) {
12897 pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12898 return KERN_INVALID_ARGUMENT;
12899 }
12900
12901 /* Lock the runtime as shared */
12902 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12903
12904 TCReturn_t tc_ret = amfi->TrustCache.query(
12905 &ppl_trust_cache_rt,
12906 query_type,
12907 cdhash,
12908 query_token);
12909
12910 /* Unlock the runtime */
12911 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12912
12913 if (tc_ret.error == kTCReturnSuccess) {
12914 ret = KERN_SUCCESS;
12915 } else if (tc_ret.error == kTCReturnNotFound) {
12916 ret = KERN_NOT_FOUND;
12917 } else {
12918 ret = KERN_FAILURE;
12919 pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12920 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12921 }
12922
12923 return ret;
12924 }
12925
12926 MARK_AS_PMAP_TEXT kern_return_t
12927 pmap_query_trust_cache_internal(
12928 TCQueryType_t query_type,
12929 const uint8_t cdhash[kTCEntryHashSize],
12930 TrustCacheQueryToken_t *query_token)
12931 {
12932 kern_return_t ret = KERN_NOT_FOUND;
12933 TrustCacheQueryToken_t query_token_safe = {0};
12934 uint8_t cdhash_safe[kTCEntryHashSize] = {0};
12935
12936 /* Copy in the CDHash into PPL storage */
12937 memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
12938
12939 /* Query through the safe API since we're in the PPL now */
12940 ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
12941
12942 if (query_token != NULL) {
12943 pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12944 memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
12945 pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12946 }
12947
12948 return ret;
12949 }
12950
12951 kern_return_t
12952 pmap_query_trust_cache(
12953 TCQueryType_t query_type,
12954 const uint8_t cdhash[kTCEntryHashSize],
12955 TrustCacheQueryToken_t *query_token)
12956 {
12957 kern_return_t ret = KERN_NOT_FOUND;
12958
12959 ret = pmap_query_trust_cache_ppl(
12960 query_type,
12961 cdhash,
12962 query_token);
12963
12964 return ret;
12965 }
12966
12967 MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
12968 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12969
/*
 * PPL-side toggle for the system-wide developer-mode state. Once the state
 * has been explicitly set to false it can never be turned back on; any
 * other transition is accepted (see table below).
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
12998
/* Kernel-side entry point: forward a developer-mode toggle into the PPL. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13005
13006 #endif /* PMAP_CS_PPL_MONITOR */
13007
13008 #if PMAP_CS_INCLUDE_CODE_SIGNING
13009
/*
 * Comparator for the registered-profiles red-black tree: orders nodes by
 * their raw pointer values.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13022
13023 /* Red-black tree for managing provisioning profiles */
13024 MARK_AS_PMAP_DATA static
13025 RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;
13026
13027 RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13028 RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
13029
13030 /* Lock for the profile red-black tree */
13031 MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13032
/*
 * One-time setup of the PPL provisioning-profile state: initialize the
 * red-black tree of registered profiles and its (non-sleeping) rw-lock.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* PPL code cannot block, so the lock must never sleep */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13045
13046 static bool
13047 pmap_is_testflight_profile(
13048 pmap_cs_profile_t *profile_obj)
13049 {
13050 const char *entitlement_name = "beta-reports-active";
13051 const size_t entitlement_length = strlen(entitlement_name);
13052 CEQueryOperation_t query[2] = {0};
13053
13054 /* If the profile provisions no entitlements, then it isn't a test flight one */
13055 if (profile_obj->entitlements_ctx == NULL) {
13056 return false;
13057 }
13058
13059 /* Build our CoreEntitlements query */
13060 query[0].opcode = kCEOpSelectKey;
13061 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13062 query[0].parameters.stringParameter.length = entitlement_length;
13063 query[1] = CEMatchBool(true);
13064
13065 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13066 profile_obj->entitlements_ctx,
13067 query, 2);
13068
13069 if (ce_err == amfi->CoreEntitlements.kNoError) {
13070 return true;
13071 }
13072
13073 return false;
13074 }
13075
13076 static bool
13077 pmap_is_development_profile(
13078 pmap_cs_profile_t *profile_obj)
13079 {
13080 /* Check for UPP */
13081 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13082 *profile_obj->profile_ctx,
13083 CESelectDictValue("ProvisionsAllDevices"));
13084 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13085 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13086 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13087 return false;
13088 }
13089 }
13090
13091 /* Check for TestFlight profile */
13092 if (pmap_is_testflight_profile(profile_obj) == true) {
13093 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13094 return false;
13095 }
13096
13097 pmap_cs_log_info("%p: development profile", profile_obj);
13098 return true;
13099 }
13100
/*
 * Locate, validate, and index the "Entitlements" dictionary of a profile,
 * caching a CoreEntitlements query context inside the profile object.
 *
 * Returns KERN_SUCCESS when the context is set up, KERN_NOT_FOUND when the
 * profile provisions no entitlements (entitlements_ctx left NULL), and
 * KERN_ABORTED when CoreEntitlements validation or context acquisition
 * fails.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No "Entitlements" key: clear the cached context and report it */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER-encoded entitlements before building a context */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13151
/*
 * PPL implementation of provisioning-profile registration.
 *
 * The payload pages are locked down (kept PPL-writable for the embedded
 * profile object), the profile blob is validated through CoreTrust, a
 * CoreEntitlements context is built over its content, and the resulting
 * profile object is inserted into the registered-profiles red-black tree.
 *
 * Returns KERN_RESOURCE_SHORTAGE when no spare PPL page is available
 * (callers donate a page and retry); panics on validation anomalies.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* CoreTrust validates the blob and hands back its inner content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13255
13256 kern_return_t
13257 pmap_register_provisioning_profile(
13258 const vm_address_t payload_addr,
13259 const vm_size_t payload_size)
13260 {
13261 kern_return_t ret = KERN_DENIED;
13262
13263 ret = pmap_register_provisioning_profile_ppl(
13264 payload_addr,
13265 payload_size);
13266
13267 while (ret == KERN_RESOURCE_SHORTAGE) {
13268 /* Allocate a page from the free list */
13269 pmap_alloc_page_for_ppl(0);
13270
13271 /* Attempt the call again */
13272 ret = pmap_register_provisioning_profile_ppl(
13273 payload_addr,
13274 payload_size);
13275 }
13276
13277 return ret;
13278 }
13279
/*
 * PPL implementation of profile unregistration. Fails with KERN_FAILURE if
 * the profile still has outstanding references; otherwise removes it from
 * the registered tree and unlocks (returns to the kernel) the original
 * payload pages.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* Still-referenced profiles cannot be unregistered */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		profile_obj = NULL;
	}
	return ret;
}
13327
/*
 * Kernel-side entry point: forward a profile unregistration into the PPL.
 */
kern_return_t
pmap_unregister_provisioning_profile(
	pmap_cs_profile_t *profile_obj)
{
	return pmap_unregister_provisioning_profile_ppl(profile_obj);
}
13334
/*
 * PPL implementation for associating a registered provisioning profile with
 * a code directory (signature) entry.
 *
 * Fails with KERN_DENIED when the signature is already trusted or already
 * has a profile; panics when the profile is unknown or unvalidated. On
 * success the signature holds a reference on the profile.
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* Drop the code directory lock taken by pmap_cs_lock_code_directory() */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13383
/*
 * Associate a registered provisioning profile with a code signature object
 * through the PPL.
 */
kern_return_t
pmap_associate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
}
13391
/*
 * Break the association between a code signature object and its provisioning
 * profile.
 *
 * Returns KERN_NOT_FOUND when no profile is associated. On success, drops the
 * reference which the association held on the profile object (panicking on
 * reference count underflow), allowing the profile to be unregistered later.
 */
kern_return_t
pmap_disassociate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	pmap_cs_profile_t *profile_obj = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->profile_obj == NULL) {
		ret = KERN_NOT_FOUND;
		goto exit;
	}
	profile_obj = cd_entry->profile_obj;

	/* Disassociate the profile from the signature */
	cd_entry->profile_obj = NULL;

	/* Disassociation was a success */
	ret = KERN_SUCCESS;

exit:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	/* Drop the reference outside the code directory lock */
	if (ret == KERN_SUCCESS) {
		/* Decrement the reference count on the profile object */
		uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
		if (reference_count == UINT32_MAX) {
			panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
		}
		pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
	}
	return ret;
}
13427
/*
 * Disassociate a provisioning profile from a code signature object through
 * the PPL.
 */
kern_return_t
pmap_disassociate_provisioning_profile(
	pmap_cs_code_directory_t *cd_entry)
{
	return pmap_disassociate_provisioning_profile_ppl(cd_entry);
}
13434
13435 kern_return_t
13436 pmap_associate_kernel_entitlements_internal(
13437 pmap_cs_code_directory_t *cd_entry,
13438 const void *kernel_entitlements)
13439 {
13440 kern_return_t ret = KERN_DENIED;
13441
13442 if (kernel_entitlements == NULL) {
13443 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13444 }
13445
13446 /* Acquire the lock on the code directory */
13447 pmap_cs_lock_code_directory(cd_entry);
13448
13449 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13450 ret = KERN_DENIED;
13451 goto out;
13452 } else if (cd_entry->kernel_entitlements != NULL) {
13453 ret = KERN_DENIED;
13454 goto out;
13455 }
13456 cd_entry->kernel_entitlements = kernel_entitlements;
13457
13458 /* Association was a success */
13459 ret = KERN_SUCCESS;
13460
13461 out:
13462 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13463 return ret;
13464 }
13465
/*
 * Associate a kernel entitlements object with a code signature object through
 * the PPL.
 */
kern_return_t
pmap_associate_kernel_entitlements(
	pmap_cs_code_directory_t *cd_entry,
	const void *kernel_entitlements)
{
	return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
}
13473
/*
 * Resolve the kernel entitlements object associated with the main-region code
 * signature of a user pmap.
 *
 * Returns:
 *   KERN_NOT_FOUND - kernel pmap, no main-region code signature, or no kernel
 *                    entitlements were ever associated with the signature.
 *   KERN_ABORTED   - the pmap lock could not be taken without blocking; the
 *                    caller is expected to retry (see
 *                    pmap_resolve_kernel_entitlements()).
 *   KERN_SUCCESS   - *kernel_entitlements (when non-NULL) now holds the
 *                    entitlements pointer.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		/* Pin the caller's kernel buffer while writing through it */
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13536
13537 kern_return_t
13538 pmap_resolve_kernel_entitlements(
13539 pmap_t pmap,
13540 const void **kernel_entitlements)
13541 {
13542 kern_return_t ret = KERN_DENIED;
13543
13544 do {
13545 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13546 } while (ret == KERN_ABORTED);
13547
13548 return ret;
13549 }
13550
/*
 * Build the CoreEntitlements acceleration index for a code directory's
 * entitlements context.
 *
 * The index buffer (at most PAGE_SIZE, including its
 * pmap_cs_ce_acceleration_buffer_t header) is placed, in order of preference:
 *   1. in the unused, suitably aligned tail of the locked-down code signature
 *      region (no separate allocation),
 *   2. in a buffer from the PPL blob allocator,
 *   3. in a freshly allocated PPL page.
 *
 * Returns KERN_SUCCESS when the context ends up accelerated (or needs no
 * acceleration), KERN_DENIED for non-reconstituted/untrusted signatures,
 * KERN_ABORTED when the index cannot fit within a page, or the allocator's
 * failure code (the caller retries on KERN_RESOURCE_SHORTAGE).
 */
kern_return_t
pmap_accelerate_entitlements_internal(
	pmap_cs_code_directory_t *cd_entry)
{
	const coreentitlements_t *CoreEntitlements = NULL;
	const CS_SuperBlob *superblob = NULL;
	pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
	size_t signature_length = 0;
	size_t acceleration_length = 0;
	size_t required_length = 0;
	kern_return_t ret = KERN_DENIED;

	/* Setup the CoreEntitlements interface */
	CoreEntitlements = &amfi->CoreEntitlements;

	CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	/*
	 * Only reconstituted code signatures can be accelerated. This is only a policy
	 * decision we make since this allows us to re-use any unused space within the
	 * locked down code signature region. There is also a decent bit of validation
	 * within the reconstitution function to ensure blobs are ordered and do not
	 * contain any padding around them which can cause issues here.
	 *
	 * This also serves as a check to ensure the signature is trusted.
	 */
	if (cd_entry->unneeded_code_signature_unlocked == false) {
		ret = KERN_DENIED;
		goto out;
	}

	/* No entitlements context, or already accelerated: nothing to do */
	if (cd_entry->ce_ctx == NULL) {
		ret = KERN_SUCCESS;
		goto out;
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
		ret = KERN_SUCCESS;
		goto out;
	}

	/* We only support accelerating when size <= PAGE_SIZE */
	ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
	if (ce_err != CoreEntitlements->kNoError) {
		if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
			/* Small entitlement blobs aren't eligible */
			ret = KERN_SUCCESS;
			goto out;
		}
		panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (acceleration_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}
	assert(acceleration_length > 0);

	superblob = cd_entry->superblob;
	signature_length = ntohl(superblob->length);

	/* Adjust the required length for the overhead structure -- can't overflow */
	required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
	if (required_length > PAGE_SIZE) {
		ret = KERN_ABORTED;
		goto out;
	}

	/*
	 * First we'll check if the code signature has enough space within the locked down
	 * region of memory to hold the buffer. If not, then we'll see if we can bucket
	 * allocate the buffer, and if not, we'll just allocate an entire page from the
	 * free list.
	 *
	 * When we're storing the buffer within the code signature, we also need to make
	 * sure we account for alignment of the buffer.
	 */
	const vm_address_t align_mask = sizeof(void*) - 1;
	size_t required_length_within_sig = required_length + align_mask;

	if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
		vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
		aligned_buf = (aligned_buf + align_mask) & ~align_mask;

		/* We need to resolve to the physical aperture */
		pmap_paddr_t phys_addr = kvtophys(aligned_buf);
		acceleration_buf = (void*)phystokv(phys_addr);

		/* Ensure the offset within the page wasn't lost */
		assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));

		/* Buffer lives inside the signature region; there is nothing to free later */
		acceleration_buf->allocated = false;
		pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
	} else {
		if (required_length <= pmap_cs_blob_limit) {
			struct pmap_cs_blob *bucket = NULL;
			size_t bucket_size = 0;

			/* Allocate a buffer from the blob allocator */
			ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)bucket->blob;
			pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
		} else {
			pmap_paddr_t phys_addr = 0;
			ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
			if (ret != KERN_SUCCESS) {
				goto out;
			}
			acceleration_buf = (void*)phystokv(phys_addr);
			pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
		}
		acceleration_buf->allocated = true;
	}
	acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
	acceleration_buf->length = acceleration_length;

	/* Take the acceleration buffer lock */
	pmap_simple_lock(&pmap_cs_acceleration_buf_lock);

	/* Setup the global acceleration buffer state */
	pmap_cs_acceleration_buf = acceleration_buf;

	/* Accelerate the entitlements */
	ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
	if (ce_err != CoreEntitlements->kNoError) {
		panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
		    cd_entry, CoreEntitlements->GetErrorString(ce_err));
	} else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
		panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
	}

	/*
	 * The global acceleration buffer lock is unlocked by the allocation function itself
	 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
	 * an assert that the lock is unlocked here since another thread could have acquired
	 * it by now.
	 */
	ret = KERN_SUCCESS;

out:
	lck_rw_unlock_exclusive(&cd_entry->rwlock);
	return ret;
}
13697
13698 kern_return_t
13699 pmap_accelerate_entitlements(
13700 pmap_cs_code_directory_t *cd_entry)
13701 {
13702 kern_return_t ret = KERN_DENIED;
13703
13704 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13705 while (ret == KERN_RESOURCE_SHORTAGE) {
13706 /* Allocate a page for the PPL */
13707 pmap_alloc_page_for_ppl(0);
13708
13709 /* Try again */
13710 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13711 }
13712
13713 return ret;
13714 }
13715
13716 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13717
13718 MARK_AS_PMAP_TEXT bool
13719 pmap_lookup_in_loaded_trust_caches_internal(
13720 const uint8_t cdhash[CS_CDHASH_LEN])
13721 {
13722 kern_return_t kr = KERN_NOT_FOUND;
13723
13724 #if PMAP_CS_PPL_MONITOR
13725 /*
13726 * If we have the PPL monitor, then this function can only be called from
13727 * within the PPL. Calling it directly would've caused a panic, so we can
13728 * assume that we're in the PPL here.
13729 */
13730 uint8_t cdhash_safe[CS_CDHASH_LEN];
13731 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13732
13733 kr = pmap_query_trust_cache_safe(
13734 kTCQueryTypeLoadable,
13735 cdhash_safe,
13736 NULL);
13737 #else
13738 kr = query_trust_cache(
13739 kTCQueryTypeLoadable,
13740 cdhash,
13741 NULL);
13742 #endif
13743
13744 if (kr == KERN_SUCCESS) {
13745 return true;
13746 }
13747 return false;
13748 }
13749
/*
 * Kernel entry point: query the loadable trust caches for a CDHash,
 * trampolining into the PPL when the monitor is enabled.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
13760
/*
 * Look up a CDHash in the static (engraved) trust cache.
 *
 * Returns 0 when the hash is not found; otherwise a packed result word with
 * TC_LOOKUP_FOUND, the entry's hash type, and the entry's flags. Note the
 * (uint8_t)flags cast keeps only the low 8 flag bits -- presumably the upper
 * bits are never needed in the packed word; confirm against TC_LOOKUP_* users.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the entry's flags and hash type from the query token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
13801
/*
 * Kernel entry point: look up a CDHash in the static trust cache,
 * trampolining into the PPL when the monitor is enabled.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
13811
13812 #if PMAP_CS_INCLUDE_CODE_SIGNING
13813
13814 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
13815 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13816
13817 MARK_AS_PMAP_TEXT void
13818 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13819 {
13820
13821 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13822 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13823 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13824
13825 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13826 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13827 }
13828
13829 MARK_AS_PMAP_TEXT bool
13830 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13831 {
13832 bool match = false;
13833
13834 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13835 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13836 match = true;
13837 }
13838 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13839
13840 if (match) {
13841 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13842 }
13843
13844 return match;
13845 }
13846
/*
 * Kernel entry point: store the compilation service CDHash, trampolining into
 * the PPL when the monitor is enabled.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
13856
/*
 * Kernel entry point: match a CDHash against the stored compilation service
 * CDHash, trampolining into the PPL when the monitor is enabled.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
13866
13867 /*
13868 * As part of supporting local signing on the device, we need the PMAP layer
13869 * to store the local signing key so that PMAP_CS can validate with it. We
13870 * store it at the PMAP layer such that it is accessible to both AMFI and
13871 * PMAP_CS should they need it.
13872 */
13873 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13874 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13875
13876 MARK_AS_PMAP_TEXT void
13877 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13878 {
13879 bool key_set = false;
13880
13881 /*
13882 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13883 * a successful exchange means that the local signing public key has _not_ been
13884 * set. In case the key has been set, we panic as we would never expect the
13885 * kernel to attempt to set the key more than once.
13886 */
13887 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13888
13889 if (key_set) {
13890 panic("attempted to set the local signing public key multiple times");
13891 }
13892
13893 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13894 pmap_cs_log_info("set local signing public key");
13895 }
13896
/*
 * Kernel entry point: install the local signing public key, trampolining into
 * the PPL when the monitor is enabled.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
13906
13907 uint8_t*
13908 pmap_get_local_signing_public_key(void)
13909 {
13910 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13911
13912 if (key_set) {
13913 return pmap_local_signing_public_key;
13914 }
13915
13916 return NULL;
13917 }
13918
13919 /*
13920 * Locally signed applications need to be explicitly authorized by an entitled application
13921 * before we allow them to run.
13922 */
13923 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
13924 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13925
/*
 * Authorize a single locally-signed CDHash to run by storing it as the
 * current unrestricted hash. The copy is made under
 * pmap_local_signing_cdhash_lock so readers never observe a torn hash.
 * Each call replaces the previously unrestricted hash.
 */
MARK_AS_PMAP_TEXT void
pmap_unrestrict_local_signing_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{

	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);

	pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
	    cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
}
13938
/*
 * Kernel entry point: unrestrict local signing for a CDHash, trampolining
 * into the PPL when the monitor is enabled.
 */
void
pmap_unrestrict_local_signing(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_unrestrict_local_signing_ppl(cdhash);
#else
	return pmap_unrestrict_local_signing_internal(cdhash);
#endif
}
13949
13950 #if PMAP_CS
/*
 * Re-restrict local signing by clearing the stored unrestricted CDHash.
 * Afterwards pmap_local_signing_restricted() reports every CDHash as
 * restricted (except an all-zero hash, which trivially matches the cleared
 * buffer).
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
13958
13959 MARK_AS_PMAP_TEXT static bool
13960 pmap_local_signing_restricted(
13961 const uint8_t cdhash[CS_CDHASH_LEN])
13962 {
13963 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13964 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13965 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13966
13967 return ret != 0;
13968 }
13969
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13972
/*
 * Toggle footprint-accounting suspension for the current thread
 * (DEVELOPMENT/DEBUG kernels only; a no-op otherwise).
 *
 * Note the asymmetry: the pmap's footprint_was_suspended flag is set on
 * suspend but never cleared on resume -- it appears to record that a
 * suspension ever occurred. NOTE(review): inferred from this code alone;
 * confirm with the consumers of footprint_was_suspended.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t map,
	boolean_t suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		/* Sticky flag: deliberately not cleared on resume below */
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
13990
/*
 * Kernel entry point: suspend/resume footprint accounting for the current
 * thread, trampolining into the PPL when the monitor is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14002
/*
 * No-op PPL entry point: validates the pmap pointer and performs no other
 * work (useful for measuring PPL call overhead and exercising the trampoline).
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14008
/*
 * Kernel entry point for the no-op PPL call; trampolines into the PPL when
 * the monitor is enabled.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14018
14019 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14020
/*
 * Header emitted before each translation table copied out by
 * pmap_dump_page_tables_recurse(); the raw table entries follow immediately.
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of this translation table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by this table */
	uint64_t end_va;      /* first VA past this table's coverage */
};
14027
/*
 * Recursively copy a pmap's translation tables into a caller-supplied buffer.
 *
 * For each visited table whose level is selected in level_mask, a
 * page_table_dump_header followed by the raw table contents is appended at
 * offset *bytes_copied. Valid non-block entries are then descended into.
 * Note the buffer-space check runs even for levels filtered out by
 * level_mask, so the check is conservative.
 *
 * Returns KERN_INSUFFICIENT_BUFFER_SIZE when the buffer cannot hold the next
 * table (plus header), KERN_SUCCESS otherwise. Panics on a table-type entry
 * at leaf level (corrupt page tables).
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	/* The root table may be allocated at a non-default size */
	if (cur_level == pt_attr_root_level(pt_attr)) {
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		/* Emit a header describing this table, then its raw contents */
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		/* Block/page mappings carry no lower-level table to descend into */
		if ((tte & type_mask) == type_block) {
			continue;
		} else {
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14101
/*
 * Dump a pmap's translation tables into the supplied buffer, starting at the
 * root level. Must only be called from kernel debugger (KDP) context; panics
 * otherwise. See pmap_dump_page_tables_recurse() for the output format.
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
{
	if (not_in_kdp) {
		panic("pmap_dump_page_tables must only be called from kernel debugger context");
	}
	return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
	    level_mask, pmap->min, bufp, buf_end, bytes_copied);
}
14111
14112 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14113
/* Page-table dumping is only supported on arm64 DEVELOPMENT/DEBUG kernels */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14120 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14121
14122
14123 #ifdef CONFIG_XNUPOST
14124 #ifdef __arm64__
14125 static volatile bool pmap_test_took_fault = false;
14126
14127 static bool
14128 pmap_test_fault_handler(arm_saved_state_t * state)
14129 {
14130 bool retval = false;
14131 uint32_t esr = get_saved_state_esr(state);
14132 esr_exception_class_t class = ESR_EC(esr);
14133 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14134
14135 if ((class == ESR_EC_DABORT_EL1) &&
14136 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14137 pmap_test_took_fault = true;
14138 /* return to the instruction immediately after the call to NX page */
14139 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14140 retval = true;
14141 }
14142
14143 return retval;
14144 }
14145
// Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform one read or write access to 'va' -- optionally after switching to
 * the given pmap with PAN disabled -- and return whether the access faulted
 * exactly as expected (via pmap_test_fault_handler). Interrupts and
 * preemption are disabled across the pmap switch.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14198
14199 static bool
14200 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14201 {
14202 bool retval = pmap_test_access(pmap, va, should_fault, false);
14203
14204 if (!retval) {
14205 T_FAIL("%s: %s, "
14206 "pmap=%p, va=%p, should_fault=%u",
14207 __func__, should_fault ? "did not fault" : "faulted",
14208 pmap, (void*)va, (unsigned)should_fault);
14209 }
14210
14211 return retval;
14212 }
14213
14214 static bool
14215 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14216 {
14217 bool retval = pmap_test_access(pmap, va, should_fault, true);
14218
14219 if (!retval) {
14220 T_FAIL("%s: %s, "
14221 "pmap=%p, va=%p, should_fault=%u",
14222 __func__, should_fault ? "did not fault" : "faulted",
14223 pmap, (void*)va, (unsigned)should_fault);
14224 }
14225
14226 return retval;
14227 }
14228
14229 static bool
14230 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14231 {
14232 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14233 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14234
14235 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14236
14237 if (!retval) {
14238 T_FAIL("%s: bits=%u, "
14239 "pa=%p, should_be_set=%u",
14240 __func__, bits,
14241 (void*)pa, should_be_set);
14242 }
14243
14244 return retval;
14245 }
14246
14247 static __attribute__((noinline)) bool
14248 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14249 {
14250 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14251 return retval;
14252 }
14253
14254 static int
14255 pmap_test_test_config(unsigned int flags)
14256 {
14257 T_LOG("running pmap_test_test_config flags=0x%X", flags);
14258 unsigned int map_count = 0;
14259 unsigned long page_ratio = 0;
14260 pmap_t pmap = pmap_create_options(NULL, 0, flags);
14261
14262 if (!pmap) {
14263 panic("Failed to allocate pmap");
14264 }
14265
14266 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14267 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14268 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14269 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14270
14271 if (pmap_page_size <= native_page_size) {
14272 page_ratio = native_page_size / pmap_page_size;
14273 } else {
14274 /*
14275 * We claim to support a page_ratio of less than 1, which is
14276 * not currently supported by the pmap layer; panic.
14277 */
14278 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14279 "flags=%u",
14280 __func__, native_page_size, pmap_page_size,
14281 flags);
14282 }
14283
14284 if (PAGE_RATIO > 1) {
14285 /*
14286 * The kernel is deliberately pretending to have 16KB pages.
14287 * The pmap layer has code that supports this, so pretend the
14288 * page size is larger than it is.
14289 */
14290 pmap_page_size = PAGE_SIZE;
14291 native_page_size = PAGE_SIZE;
14292 }
14293
14294 /*
14295 * Get two pages from the VM; one to be mapped wired, and one to be
14296 * mapped nonwired.
14297 */
14298 vm_page_t unwired_vm_page = vm_page_grab();
14299 vm_page_t wired_vm_page = vm_page_grab();
14300
14301 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14302 panic("Failed to grab VM pages");
14303 }
14304
14305 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14306 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14307
14308 pmap_paddr_t pa = ptoa(pn);
14309 pmap_paddr_t wired_pa = ptoa(wired_pn);
14310
14311 /*
14312 * We'll start mappings at the second twig TT. This keeps us from only
14313 * using the first entry in each TT, which would trivially be address
14314 * 0; one of the things we will need to test is retrieving the VA for
14315 * a given PTE.
14316 */
14317 vm_map_address_t va_base = pmap_twig_size;
14318 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14319
14320 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14321 /*
14322 * Not exactly a functional failure, but this test relies on
14323 * there being a spare PTE slot we can use to pin the TT.
14324 */
14325 panic("Cannot pin translation table");
14326 }
14327
14328 /*
14329 * Create the wired mapping; this will prevent the pmap layer from
14330 * reclaiming our test TTs, which would interfere with this test
14331 * ("interfere" -> "make it panic").
14332 */
14333 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14334
14335 #if XNU_MONITOR
14336 /*
14337 * If the PPL is enabled, make sure that the kernel cannot write
14338 * to PPL memory.
14339 */
14340 if (!pmap_ppl_disable) {
14341 T_LOG("Validate that kernel cannot write to PPL memory.");
14342 pt_entry_t * ptep = pmap_pte(pmap, va_base);
14343 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14344 }
14345 #endif
14346
14347 /*
14348 * Create read-only mappings of the nonwired page; if the pmap does
14349 * not use the same page size as the kernel, create multiple mappings
14350 * so that the kernel page is fully mapped.
14351 */
14352 for (map_count = 0; map_count < page_ratio; map_count++) {
14353 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14354 }
14355
14356 /* Validate that all the PTEs have the expected PA and VA. */
14357 for (map_count = 0; map_count < page_ratio; map_count++) {
14358 pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14359
14360 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14361 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14362 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14363 }
14364
14365 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14366 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14367 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14368 }
14369 }
14370
14371 T_LOG("Validate that reads to our mapping do not fault.");
14372 pmap_test_read(pmap, va_base, false);
14373
14374 T_LOG("Validate that writes to our mapping fault.");
14375 pmap_test_write(pmap, va_base, true);
14376
14377 T_LOG("Make the first mapping writable.");
14378 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14379
14380 T_LOG("Validate that writes to our mapping do not fault.");
14381 pmap_test_write(pmap, va_base, false);
14382
14383
14384 T_LOG("Make the first mapping execute-only");
14385 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14386
14387
14388 T_LOG("Validate that reads to our mapping do not fault.");
14389 pmap_test_read(pmap, va_base, false);
14390
14391 T_LOG("Validate that writes to our mapping fault.");
14392 pmap_test_write(pmap, va_base, true);
14393
14394
14395 /*
14396 * For page ratios of greater than 1: validate that writes to the other
14397 * mappings still fault. Remove the mappings afterwards (we're done
14398 * with page ratio testing).
14399 */
14400 for (map_count = 1; map_count < page_ratio; map_count++) {
14401 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14402 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14403 }
14404
14405 T_LOG("Mark the page unreferenced and unmodified.");
14406 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14407 pmap_test_check_refmod(pa, 0);
14408
14409 /*
14410 * Begin testing the ref/mod state machine. Re-enter the mapping with
14411 * different protection/fault_type settings, and confirm that the
14412 * ref/mod state matches our expectations at each step.
14413 */
14414 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
14415 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14416 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14417
14418 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
14419 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14420 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14421 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14422
14423 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
14424 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14425 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14426 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14427
14428 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
14429 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14430 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14431
14432 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
14433 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14434 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14435 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14436
14437 /*
14438 * Shared memory testing; we'll have two mappings; one read-only,
14439 * one read-write.
14440 */
14441 vm_map_address_t rw_base = va_base;
14442 vm_map_address_t ro_base = va_base + pmap_page_size;
14443
14444 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14445 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14446
14447 /*
14448 * Test that we take faults as expected for unreferenced/unmodified
14449 * pages. Also test the arm_fast_fault interface, to ensure that
14450 * mapping permissions change as expected.
14451 */
14452 T_LOG("!ref/!mod: expect no access");
14453 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14454 pmap_test_read_write(pmap, ro_base, false, false);
14455 pmap_test_read_write(pmap, rw_base, false, false);
14456
14457 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14458 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14459 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14460 pmap_test_read_write(pmap, ro_base, true, false);
14461 pmap_test_read_write(pmap, rw_base, true, false);
14462
14463 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14464 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14465 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14466 pmap_test_read_write(pmap, ro_base, true, false);
14467 pmap_test_read_write(pmap, rw_base, true, true);
14468
14469 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14470 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14471 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14472 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14473 pmap_test_read_write(pmap, ro_base, true, false);
14474 pmap_test_read_write(pmap, rw_base, true, true);
14475
14476 T_LOG("RW protect both mappings; should not change protections.");
14477 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14478 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14479 pmap_test_read_write(pmap, ro_base, true, false);
14480 pmap_test_read_write(pmap, rw_base, true, true);
14481
14482 T_LOG("Read protect both mappings; RW mapping should become RO.");
14483 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14484 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14485 pmap_test_read_write(pmap, ro_base, true, false);
14486 pmap_test_read_write(pmap, rw_base, true, false);
14487
14488 T_LOG("RW protect the page; mappings should not change protections.");
14489 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14490 pmap_page_protect(pn, VM_PROT_ALL);
14491 pmap_test_read_write(pmap, ro_base, true, false);
14492 pmap_test_read_write(pmap, rw_base, true, true);
14493
14494 T_LOG("Read protect the page; RW mapping should become RO.");
14495 pmap_page_protect(pn, VM_PROT_READ);
14496 pmap_test_read_write(pmap, ro_base, true, false);
14497 pmap_test_read_write(pmap, rw_base, true, false);
14498
14499 T_LOG("Validate that disconnect removes all known mappings of the page.");
14500 pmap_disconnect(pn);
14501 if (!pmap_verify_free(pn)) {
14502 T_FAIL("Page still has mappings");
14503 }
14504
14505 T_LOG("Remove the wired mapping, so we can tear down the test map.");
14506 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14507 pmap_destroy(pmap);
14508
14509 T_LOG("Release the pages back to the VM.");
14510 vm_page_lock_queues();
14511 vm_page_free(unwired_vm_page);
14512 vm_page_free(wired_vm_page);
14513 vm_page_unlock_queues();
14514
14515 T_LOG("Testing successful!");
14516 return 0;
14517 }
14518 #endif /* __arm64__ */
14519
14520 kern_return_t
14521 pmap_test(void)
14522 {
14523 T_LOG("Starting pmap_tests");
14524 #ifdef __arm64__
14525 int flags = 0;
14526 flags |= PMAP_CREATE_64BIT;
14527
14528 #if __ARM_MIXED_PAGE_SIZE__
14529 T_LOG("Testing VM_PAGE_SIZE_4KB");
14530 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14531 T_LOG("Testing VM_PAGE_SIZE_16KB");
14532 pmap_test_test_config(flags);
14533 #else /* __ARM_MIXED_PAGE_SIZE__ */
14534 pmap_test_test_config(flags);
14535 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14536
14537 #endif /* __arm64__ */
14538 T_PASS("completed pmap_test successfully");
14539 return KERN_SUCCESS;
14540 }
14541 #endif /* CONFIG_XNUPOST */
14542
14543 /*
14544 * The following function should never make it to RELEASE code, since
14545 * it provides a way to get the PPL to modify text pages.
14546 */
14547 #if DEVELOPMENT || DEBUG
14548
14549 #define ARM_UNDEFINED_INSN 0xe7f000f0
14550 #define ARM_UNDEFINED_INSN_THUMB 0xde00
14551
14552 /**
14553 * Forcibly overwrite executable text with an illegal instruction.
14554 *
14555 * @note Only used for xnu unit testing.
14556 *
14557 * @param pa The physical address to corrupt.
14558 *
14559 * @return KERN_SUCCESS on success.
14560 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* PPL builds: route the write through the PPL entry point. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	/* Non-PPL builds: call the implementation directly. */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14570
/*
 * Implementation of pmap_test_text_corruption(): overwrite the instruction
 * at physical address 'pa' with an undefined instruction, temporarily
 * making the physical aperture mapping writable if required.
 *
 * Holds the PV head lock for 'pa' across the write so the page's mapping
 * state cannot change underneath us.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	/* Only managed (PV-tracked) physical pages are supported. */
	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/*
	 * Executable pages are mapped read-only in the physical aperture;
	 * temporarily grant kernel write access (AP_RWNA) for the patch.
	 */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		/* Clear the THUMB bit to get the real store address. */
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	/* Restore the read-only aperture mapping (AP_RONA). */
	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Ensure the instruction fetch path observes the new bytes. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14613
14614 #endif /* DEVELOPMENT || DEBUG */
14615