1 /*
2 * Copyright (c) 2011-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
112 #ifdef __ARM64_PMAP_SUBPAGE_L1__
113 #define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
114 #else
115 #define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
116 #endif
117
118 #if __ARM_VMSA__ != 8
119 #error Unknown __ARM_VMSA__
120 #endif
121
/*
 * Number of elements in the array `x`. `x` must be an actual array (not a
 * pointer). The argument is parenthesized before it is subscripted so that
 * cast or dereference expressions such as `*(int (*)[4])buf` are measured
 * correctly instead of having `[0]` bind to the innermost operand.
 */
#define ARRAY_LEN(x) (sizeof(x) / sizeof((x)[0]))
123
124 extern u_int32_t random(void); /* from <libkern/libkern.h> */
125
126 static bool alloc_asid(pmap_t pmap);
127 static void free_asid(pmap_t pmap);
128 static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
129 static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
130 static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
132 const struct page_table_ops native_pt_ops =
133 {
134 .alloc_id = alloc_asid,
135 .free_id = free_asid,
136 .flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
137 .flush_tlb_async = flush_mmu_tlb_full_asid_async,
138 .wimg_to_pte = wimg_to_pte,
139 };
140
141 const struct page_table_level_info pmap_table_level_info_16k[] =
142 {
143 [0] = {
144 .size = ARM_16K_TT_L0_SIZE,
145 .offmask = ARM_16K_TT_L0_OFFMASK,
146 .shift = ARM_16K_TT_L0_SHIFT,
147 .index_mask = ARM_16K_TT_L0_INDEX_MASK,
148 .valid_mask = ARM_TTE_VALID,
149 .type_mask = ARM_TTE_TYPE_MASK,
150 .type_block = ARM_TTE_TYPE_BLOCK
151 },
152 [1] = {
153 .size = ARM_16K_TT_L1_SIZE,
154 .offmask = ARM_16K_TT_L1_OFFMASK,
155 .shift = ARM_16K_TT_L1_SHIFT,
156 .index_mask = ARM_16K_TT_L1_INDEX_MASK,
157 .valid_mask = ARM_TTE_VALID,
158 .type_mask = ARM_TTE_TYPE_MASK,
159 .type_block = ARM_TTE_TYPE_BLOCK
160 },
161 [2] = {
162 .size = ARM_16K_TT_L2_SIZE,
163 .offmask = ARM_16K_TT_L2_OFFMASK,
164 .shift = ARM_16K_TT_L2_SHIFT,
165 .index_mask = ARM_16K_TT_L2_INDEX_MASK,
166 .valid_mask = ARM_TTE_VALID,
167 .type_mask = ARM_TTE_TYPE_MASK,
168 .type_block = ARM_TTE_TYPE_BLOCK
169 },
170 [3] = {
171 .size = ARM_16K_TT_L3_SIZE,
172 .offmask = ARM_16K_TT_L3_OFFMASK,
173 .shift = ARM_16K_TT_L3_SHIFT,
174 .index_mask = ARM_16K_TT_L3_INDEX_MASK,
175 .valid_mask = ARM_PTE_TYPE_VALID,
176 .type_mask = ARM_PTE_TYPE_MASK,
177 .type_block = ARM_TTE_TYPE_L3BLOCK
178 }
179 };
180
181 const struct page_table_level_info pmap_table_level_info_4k[] =
182 {
183 [0] = {
184 .size = ARM_4K_TT_L0_SIZE,
185 .offmask = ARM_4K_TT_L0_OFFMASK,
186 .shift = ARM_4K_TT_L0_SHIFT,
187 .index_mask = ARM_4K_TT_L0_INDEX_MASK,
188 .valid_mask = ARM_TTE_VALID,
189 .type_mask = ARM_TTE_TYPE_MASK,
190 .type_block = ARM_TTE_TYPE_BLOCK
191 },
192 [1] = {
193 .size = ARM_4K_TT_L1_SIZE,
194 .offmask = ARM_4K_TT_L1_OFFMASK,
195 .shift = ARM_4K_TT_L1_SHIFT,
196 .index_mask = ARM_4K_TT_L1_INDEX_MASK,
197 .valid_mask = ARM_TTE_VALID,
198 .type_mask = ARM_TTE_TYPE_MASK,
199 .type_block = ARM_TTE_TYPE_BLOCK
200 },
201 [2] = {
202 .size = ARM_4K_TT_L2_SIZE,
203 .offmask = ARM_4K_TT_L2_OFFMASK,
204 .shift = ARM_4K_TT_L2_SHIFT,
205 .index_mask = ARM_4K_TT_L2_INDEX_MASK,
206 .valid_mask = ARM_TTE_VALID,
207 .type_mask = ARM_TTE_TYPE_MASK,
208 .type_block = ARM_TTE_TYPE_BLOCK
209 },
210 [3] = {
211 .size = ARM_4K_TT_L3_SIZE,
212 .offmask = ARM_4K_TT_L3_OFFMASK,
213 .shift = ARM_4K_TT_L3_SHIFT,
214 .index_mask = ARM_4K_TT_L3_INDEX_MASK,
215 .valid_mask = ARM_PTE_TYPE_VALID,
216 .type_mask = ARM_PTE_TYPE_MASK,
217 .type_block = ARM_TTE_TYPE_L3BLOCK
218 }
219 };
220
221 const struct page_table_attr pmap_pt_attr_4k = {
222 .pta_level_info = pmap_table_level_info_4k,
223 .pta_root_level = (T0SZ_BOOT - 16) / 9,
224 #if __ARM_MIXED_PAGE_SIZE__
225 .pta_commpage_level = PMAP_TT_L2_LEVEL,
226 #else /* __ARM_MIXED_PAGE_SIZE__ */
227 #if __ARM_16K_PG__
228 .pta_commpage_level = PMAP_TT_L2_LEVEL,
229 #else /* __ARM_16K_PG__ */
230 .pta_commpage_level = PMAP_TT_L1_LEVEL,
231 #endif /* __ARM_16K_PG__ */
232 #endif /* __ARM_MIXED_PAGE_SIZE__ */
233 .pta_max_level = PMAP_TT_L3_LEVEL,
234 .pta_ops = &native_pt_ops,
235 .ap_ro = ARM_PTE_AP(AP_RORO),
236 .ap_rw = ARM_PTE_AP(AP_RWRW),
237 .ap_rona = ARM_PTE_AP(AP_RONA),
238 .ap_rwna = ARM_PTE_AP(AP_RWNA),
239 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
240 .ap_x = ARM_PTE_PNX,
241 #if __ARM_MIXED_PAGE_SIZE__
242 .pta_tcr_value = TCR_EL1_4KB,
243 #endif /* __ARM_MIXED_PAGE_SIZE__ */
244 .pta_page_size = 4096,
245 .pta_page_shift = 12,
246 };
247
248 const struct page_table_attr pmap_pt_attr_16k = {
249 .pta_level_info = pmap_table_level_info_16k,
250 .pta_root_level = PMAP_TT_L1_LEVEL,
251 .pta_commpage_level = PMAP_TT_L2_LEVEL,
252 .pta_max_level = PMAP_TT_L3_LEVEL,
253 .pta_ops = &native_pt_ops,
254 .ap_ro = ARM_PTE_AP(AP_RORO),
255 .ap_rw = ARM_PTE_AP(AP_RWRW),
256 .ap_rona = ARM_PTE_AP(AP_RONA),
257 .ap_rwna = ARM_PTE_AP(AP_RWNA),
258 .ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
259 .ap_x = ARM_PTE_PNX,
260 #if __ARM_MIXED_PAGE_SIZE__
261 .pta_tcr_value = TCR_EL1_16KB,
262 #endif /* __ARM_MIXED_PAGE_SIZE__ */
263 .pta_page_size = 16384,
264 .pta_page_shift = 14,
265 };
266
267 #if __ARM_16K_PG__
268 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
269 #else /* !__ARM_16K_PG__ */
270 const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
271 #endif /* !__ARM_16K_PG__ */
272
273
274 #if MACH_ASSERT
275 int vm_footprint_suspend_allowed = 1;
276
277 extern int pmap_ledgers_panic;
278 extern int pmap_ledgers_panic_leeway;
279
280 #endif /* MACH_ASSERT */
281
282 #if DEVELOPMENT || DEBUG
283 #define PMAP_FOOTPRINT_SUSPENDED(pmap) \
284 (current_thread()->pmap_footprint_suspended)
285 #else /* DEVELOPMENT || DEBUG */
286 #define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
287 #endif /* DEVELOPMENT || DEBUG */
288
289
290 /*
291 * Represents a tlb range that will be flushed before exiting
292 * the ppl.
293 * Used by phys_attribute_clear_range to defer flushing pages in
294 * this range until the end of the operation.
295 */
296 typedef struct pmap_tlb_flush_range {
297 pmap_t ptfr_pmap;
298 vm_map_address_t ptfr_start;
299 vm_map_address_t ptfr_end;
300 bool ptfr_flush_needed;
301 } pmap_tlb_flush_range_t;
302
303 #if XNU_MONITOR
304 /*
305 * PPL External References.
306 */
307 extern vm_offset_t segPPLDATAB;
308 extern unsigned long segSizePPLDATA;
309 extern vm_offset_t segPPLTEXTB;
310 extern unsigned long segSizePPLTEXT;
311 extern vm_offset_t segPPLDATACONSTB;
312 extern unsigned long segSizePPLDATACONST;
313
314
315 /*
316 * PPL Global Variables
317 */
318
319 #if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
320 /* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
321 SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
322 #else
323 const boolean_t pmap_ppl_disable = FALSE;
324 #endif
325
326 /*
327 * Indicates if the PPL has started applying APRR.
328 * This variable is accessed from various assembly trampolines, so be sure to change
329 * those if you change the size or layout of this variable.
330 */
331 boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;
332
333 extern void *pmap_stacks_start;
334 extern void *pmap_stacks_end;
335
336 #endif /* XNU_MONITOR */
337
338
339
340 /* Virtual memory region for early allocation */
341 #define VREGION1_HIGH_WINDOW (PE_EARLY_BOOT_VA)
342 #define VREGION1_START ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
343 #define VREGION1_SIZE (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))
344
345 extern uint8_t bootstrap_pagetables[];
346
347 extern unsigned int not_in_kdp;
348
349 extern vm_offset_t first_avail;
350
351 extern vm_offset_t virtual_space_start; /* Next available kernel VA */
352 extern vm_offset_t virtual_space_end; /* End of kernel address space */
353 extern vm_offset_t static_memory_end;
354
355 extern const vm_map_address_t physmap_base;
356 extern const vm_map_address_t physmap_end;
357
358 extern int maxproc, hard_maxproc;
359
360 /* The number of address bits one TTBR can cover. */
361 #define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)
362
363 /*
364 * The bounds on our TTBRs. These are for sanity checking that
365 * an address is accessible by a TTBR before we attempt to map it.
366 */
367
368 /* The level of the root of a page table. */
369 const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));
370
371 /* The number of entries in the root TT of a page table. */
372 const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));
373
374 struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
375 const pmap_t kernel_pmap = &kernel_pmap_store;
376
377 static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone; /* zone of pmap structures */
378
379 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
380 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
381 queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
382
383 typedef struct tt_free_entry {
384 struct tt_free_entry *next;
385 } tt_free_entry_t;
386
387 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
388
389 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
390 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
391 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
392 #define FREE_PAGE_SIZE_TT_MAX 4
393 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
394 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
395 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
396 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
397 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
398 unsigned int free_tt_count MARK_AS_PMAP_DATA;
399 unsigned int free_tt_max MARK_AS_PMAP_DATA;
400
401 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
402
403 unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
404 unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf user pagetable pages, in units of PAGE_SIZE */
405 unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0; /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
406 unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0; /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
407 unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0; /* leaf kernel pagetable pages, in units of PAGE_SIZE */
408 unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0; /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
409
410 SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte = 0;
411 SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;
412
413 SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte = 0; /* set by arm_vm_init() - keep out of bss */
414 SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0; /* set by arm_vm_init() - phys tte addr */
415
416 /* Lock group used for all pmap object locks. */
417 lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;
418
419 #if DEVELOPMENT || DEBUG
420 int nx_enabled = 1; /* enable no-execute protection */
421 int allow_data_exec = 0; /* No apps may execute data */
422 int allow_stack_exec = 0; /* No apps may execute from the stack */
423 unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
424 unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
425 unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
426 #else /* DEVELOPMENT || DEBUG */
427 const int nx_enabled = 1; /* enable no-execute protection */
428 const int allow_data_exec = 0; /* No apps may execute data */
429 const int allow_stack_exec = 0; /* No apps may execute from the stack */
430 #endif /* DEVELOPMENT || DEBUG */
431
432 /**
433 * This variable is set true during hibernation entry to protect pmap data structures
434 * during image copying, and reset false on hibernation exit.
435 */
436 bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
437
438 #if MACH_ASSERT
439 static void pmap_check_ledgers(pmap_t pmap);
440 #else
441 static inline void
pmap_check_ledgers(__unused pmap_t pmap)442 pmap_check_ledgers(__unused pmap_t pmap)
443 {
444 }
445 #endif /* MACH_ASSERT */
446
447 /**
448 * This helper function ensures that potentially-long-running batched PPL operations are
449 * called in preemptible context before entering the PPL, so that the PPL call may
450 * periodically exit to allow pending urgent ASTs to be taken.
451 */
452 static inline void
pmap_verify_preemptible(void)453 pmap_verify_preemptible(void)
454 {
455 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
456 }
457
458 SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);
459
460 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
461 SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;
462
463 SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE; /* Has pmap_init completed? */
464
465 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default = 0x0;
466 #if defined(__arm64__)
467 /* end of shared region + 512MB for various purposes */
468 #define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
469 _Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
470 "Minimum address space size outside allowable range");
471
472 // Max offset is 15.375GB for devices with "large" memory config
473 #define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
474 // Max offset is 11.375GB for devices with "small" memory config
475 #define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)
476
477
478 _Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
479 "Large device address space size outside allowable range");
480 _Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
481 "Small device address space size outside allowable range");
482
483 # ifdef XNU_TARGET_OS_OSX
484 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
485 # else
486 SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = 0x0;
487 # endif
488 #endif /* __arm64__ */
489
490 #if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
491 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
492 #else
493 SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
494 #endif
495
496 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
497 SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
498 SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
499 SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
500 #if !HAS_16BIT_ASID
501 SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
502 static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
503 static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
504 static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
505 #else
506 static uint16_t last_allocated_asid = 0;
507 #endif /* !HAS_16BIT_ASID */
508
509
510 #if __ARM_MIXED_PAGE_SIZE__
511 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
512 #endif
513 SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
514 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
515 SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
516
517 /* PTE Define Macros */
518
/*
 * Evaluates to true when the PTE value `x` has its low two bits clear and has
 * the ARM_PTE_COMPRESSED bit set. If such a value also has any bit set outside
 * ARM_PTE_COMPRESSED_MASK, the macro panics; `p` is only used as the pointer
 * printed in that panic message. Note that `x` is evaluated more than once.
 */
519 #define ARM_PTE_IS_COMPRESSED(x, p) \
520 ((((x) & 0x3) == 0) && /* PTE is not valid... */ \
521 ((x) & ARM_PTE_COMPRESSED) && /* ...has "compressed" marker */ \
522 ((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
523 (panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
524 (p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))
525
526 #define pte_is_wired(pte) \
527 (((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)
528
529 #define pte_was_writeable(pte) \
530 (((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)
531
532 #define pte_set_was_writeable(pte, was_writeable) \
533 do { \
534 if ((was_writeable)) { \
535 (pte) |= ARM_PTE_WRITEABLE; \
536 } else { \
537 (pte) &= ~ARM_PTE_WRITEABLE; \
538 } \
539 } while(0)
540
541 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)542 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
543 {
544 if (wired) {
545 *ptep |= ARM_PTE_WIRED;
546 } else {
547 *ptep &= ~ARM_PTE_WIRED;
548 }
549 /*
550 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
551 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
552 * never reclaimed.
553 */
554 if (pmap == kernel_pmap) {
555 return;
556 }
557 unsigned short *ptd_wiredcnt_ptr;
558 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
559 if (wired) {
560 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
561 } else {
562 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 if (__improbable(prev_wired == 0)) {
564 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
565 }
566 }
567 }
568
569 #if HAS_FEAT_XS
570
571 static inline bool
pte_is_xs(const pt_attr_t * pt_attr,pt_entry_t pte)572 pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
573 {
574 if (__improbable(pt_attr->stage2)) {
575 return false;
576 }
577 switch (ARM_PTE_EXTRACT_ATTRINDX(pte)) {
578 case CACHE_ATTRINDX_POSTED_XS:
579 case CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS:
580 return true;
581 default:
582 return false;
583 }
584 }
585
586 #endif /* HAS_FEAT_XS */
587
/*
 * Issue an asynchronous TLB invalidation for the VA range [s, e) of `pmap`
 * through the pmap's page-table ops, then call arm64_sync_tlb(strong).
 *
 * Wrapped in do { } while (0) so the macro expands to exactly one statement
 * and can be used safely as the body of an unbraced if/else. Note that `s`,
 * `pmap` and `strong` are each evaluated twice, so arguments must be free of
 * side effects.
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) \
	do { \
	        pmap_get_pt_ops(pmap)->flush_tlb_region_async((s), (size_t)((e) - (s)), (pmap), (last_level_only), (strong)); \
	        arm64_sync_tlb(strong); \
	} while (0)
592
593 /*
594 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
595 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
596 * will observe the updated PTE.
597 */
598 #define FLUSH_PTE() \
599 __builtin_arm_dmb(DMB_ISH);
600
601 /*
602 * Synchronize updates to PTEs that were previously valid and thus may be cached in
603 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
604 * TLBI. This should only require a store-store barrier, as subsequent accesses in
605 * program order will not issue until the DSB completes. Prior loads may be reordered
606 * after the barrier, but their behavior should not be materially affected by the
607 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
608 * matter for loads until the access is re-driven well after the TLB update is
609 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
610 * we should be in a position to handle access faults. For "voluntary" PTE access
611 * restriction due to unmapping or protection, the decision to restrict access should
612 * have a data dependency on prior loads in order to avoid a data race.
613 */
614 #define FLUSH_PTE_STRONG() \
615 __builtin_arm_dsb(DSB_ISHST);
616
617 /**
618 * Write enough page table entries to map a single VM page. On systems where the
619 * VM page size does not match the hardware page size, multiple page table
620 * entries will need to be written.
621 *
622 * @note This function does not emit a barrier to ensure these page table writes
623 * have completed before continuing. This is commonly needed. In the case
624 * where a DMB or DSB barrier is needed, then use the write_pte() and
625 * write_pte_strong() functions respectively instead of this one.
626 *
627 * @param ptep Pointer to the first page table entry to update.
628 * @param pte The value to write into each page table entry. In the case that
629 * multiple PTEs are updated to a non-empty value, then the address
630 * in this value will automatically be incremented for each PTE
631 * write.
632 */
633 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)634 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
635 {
636 /**
637 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
638 * systems, which is why it's checked at runtime instead of compile time.
639 * The "unreachable" warning needs to be suppressed because it still is a
640 * compile time constant on some systems.
641 */
642 __unreachable_ok_push
643 if (TEST_PAGE_RATIO_4) {
644 if (((uintptr_t)ptep) & 0x1f) {
645 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
646 __func__, ptep, (void*)pte);
647 }
648
649 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
650 /**
651 * If we're writing an empty/compressed PTE value, then don't
652 * auto-increment the address for each PTE write.
653 */
654 *ptep = pte;
655 *(ptep + 1) = pte;
656 *(ptep + 2) = pte;
657 *(ptep + 3) = pte;
658 } else {
659 *ptep = pte;
660 *(ptep + 1) = pte | 0x1000;
661 *(ptep + 2) = pte | 0x2000;
662 *(ptep + 3) = pte | 0x3000;
663 }
664 } else {
665 *ptep = pte;
666 }
667 __unreachable_ok_pop
668 }
669
670 /**
671 * Writes enough page table entries to map a single VM page and then ensures
672 * those writes complete by executing a Data Memory Barrier.
673 *
674 * @note The DMB issued by this function is not strong enough to protect against
675 * TLB invalidates from being reordered above the PTE writes. If a TLBI
676 * instruction is going to immediately be called after this write, it's
677 * recommended to call write_pte_strong() instead of this function.
678 *
679 * See the function header for write_pte_fast() for more details on the
680 * parameters.
681 */
682 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)683 write_pte(pt_entry_t *ptep, pt_entry_t pte)
684 {
685 write_pte_fast(ptep, pte);
686 FLUSH_PTE();
687 }
688
689 /**
690 * Writes enough page table entries to map a single VM page and then ensures
691 * those writes complete by executing a Data Synchronization Barrier. This
692 * barrier provides stronger guarantees than the DMB executed by write_pte().
693 *
694 * @note This function is useful if you're going to immediately flush the TLB
695 * after making the PTE write. A DSB is required to protect against the
696 * TLB invalidate being reordered before the PTE write.
697 *
698 * See the function header for write_pte_fast() for more details on the
699 * parameters.
700 */
701 static void
write_pte_strong(pt_entry_t * ptep,pt_entry_t pte)702 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
703 {
704 write_pte_fast(ptep, pte);
705 FLUSH_PTE_STRONG();
706 }
707
708 /**
709 * Retrieve the pmap structure for the thread running on the current CPU.
710 */
711 pmap_t
current_pmap()712 current_pmap()
713 {
714 const pmap_t current = vm_map_pmap(current_thread()->map);
715
716 assert(current != NULL);
717
718 #if XNU_MONITOR
719 /**
720 * On PPL-enabled systems, it's important that PPL policy decisions aren't
721 * decided by kernel-writable memory. This function is used in various parts
722 * of the PPL, and besides validating that the pointer returned by this
723 * function is indeed a pmap structure, it's also important to ensure that
724 * it's actually the current thread's pmap. This is because different pmaps
725 * will have access to different entitlements based on the code signature of
726 * their loaded process. So if a different user pmap is set in the current
727 * thread structure (in an effort to bypass code signing restrictions), even
728 * though the structure would validate correctly as it is a real pmap
729 * structure, it should fail here.
730 *
731 * This only needs to occur for user pmaps because the kernel pmap's root
732 * page table is always the same as TTBR1 (it's set during bootstrap and not
733 * changed so it'd be redundant to check), and its code signing fields are
734 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
735 * it shouldn't be possible to set those fields. Due to that, an attacker
736 * setting the current thread's pmap to the kernel pmap as a way to bypass
737 * this check won't accomplish anything as it doesn't provide any extra code
738 * signing entitlements.
739 */
740 if ((current != kernel_pmap) &&
741 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
742 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
743 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
744 }
745 #endif /* XNU_MONITOR */
746
747 return current;
748 }
749
750 #if DEVELOPMENT || DEBUG
751
752 /*
753 * Trace levels are controlled by a bitmask in which each
754 * level can be enabled/disabled by the (1<<level) position
755 * in the boot arg
756 * Level 0: PPL extension functionality
757 * Level 1: pmap lifecycle (create/destroy/switch)
758 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
759 * Level 3: internal state management (attributes/fast-fault)
760 * Level 4-7: TTE traces for paging levels 0-3. TTBs are traced at level 4.
761 */
762
763 SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;
764
/*
 * Emit a KDBG_RELEASE trace point when bit `level` is set in pmap_trace_mask.
 *
 * The body is wrapped in do { } while (0) so the macro is a single statement:
 * a bare `if` here would capture a following `else` at the call site. The
 * shift uses an unsigned constant to match the unsigned mask and to avoid
 * shifting into the sign bit.
 */
#define PMAP_TRACE(level, ...) \
	do { \
	        if (__improbable((1U << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__); \
	        } \
	} while (0)
769 #else /* DEVELOPMENT || DEBUG */
770
771 #define PMAP_TRACE(level, ...)
772
773 #endif /* DEVELOPMENT || DEBUG */
774
775
776 /*
777 * Internal function prototypes (forward declarations).
778 */
779
780 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
781
782 static void pmap_set_reference(ppnum_t pn);
783
784 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
785
786 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
787
788 static kern_return_t pmap_expand(
789 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
790
791 static int pmap_remove_range(
792 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
793
794 static tt_entry_t *pmap_tt1_allocate(
795 pmap_t, vm_size_t, unsigned int);
796
797 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
798
799 static void pmap_tt1_deallocate(
800 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
801
802 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
803
804 static kern_return_t pmap_tt_allocate(
805 pmap_t, tt_entry_t **, unsigned int, unsigned int);
806
807 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
808
809 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
810 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
811 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
812
813 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
814
815
816 static void pmap_unmap_commpage(
817 pmap_t pmap);
818
819 static boolean_t
820 pmap_is_64bit(pmap_t);
821
822
823 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
824
825 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
826
827 static bool pmap_update_cache_attributes_locked(
828 ppnum_t, unsigned, bool);
829
830 static boolean_t arm_clear_fast_fault(
831 ppnum_t ppnum,
832 vm_prot_t fault_type,
833 pt_entry_t *pte_p);
834
835 static void pmap_trim_self(pmap_t pmap);
836 static void pmap_trim_subord(pmap_t subord);
837
838
839 /*
840 * Temporary prototypes, while we wait for pmap_enter to move to taking an
841 * address instead of a page number.
842 */
843 static kern_return_t
844 pmap_enter_addr(
845 pmap_t pmap,
846 vm_map_address_t v,
847 pmap_paddr_t pa,
848 vm_prot_t prot,
849 vm_prot_t fault_type,
850 unsigned int flags,
851 boolean_t wired);
852
853 kern_return_t
854 pmap_enter_options_addr(
855 pmap_t pmap,
856 vm_map_address_t v,
857 pmap_paddr_t pa,
858 vm_prot_t prot,
859 vm_prot_t fault_type,
860 unsigned int flags,
861 boolean_t wired,
862 unsigned int options,
863 __unused void *arg);
864
865 #ifdef CONFIG_XNUPOST
866 kern_return_t pmap_test(void);
867 #endif /* CONFIG_XNUPOST */
868
869 PMAP_SUPPORT_PROTOTYPES(
870 kern_return_t,
871 arm_fast_fault, (pmap_t pmap,
872 vm_map_address_t va,
873 vm_prot_t fault_type,
874 bool was_af_fault,
875 bool from_user), ARM_FAST_FAULT_INDEX);
876
877 PMAP_SUPPORT_PROTOTYPES(
878 boolean_t,
879 arm_force_fast_fault, (ppnum_t ppnum,
880 vm_prot_t allow_mode,
881 int options), ARM_FORCE_FAST_FAULT_INDEX);
882
883 MARK_AS_PMAP_TEXT static boolean_t
884 arm_force_fast_fault_with_flush_range(
885 ppnum_t ppnum,
886 vm_prot_t allow_mode,
887 int options,
888 pmap_tlb_flush_range_t *flush_range);
889
/**
 * Progress record for the batch cache attributes update state machine.
 * All fields are packed into a single 64-bit word (enforced by the
 * static_assert below).
 */
typedef struct {
	/* The page index to be operated on. */
	uint64_t page_index : 32;
	/* The current state of the update machine (one of the values below). */
	uint64_t state : 8;
	/* Set when the TLB flush pass is necessary. */
	uint64_t tlb_flush_pass_needed : 1;
	/* Set when the cache flush pass is necessary. */
	uint64_t rt_cache_flush_pass_needed : 1;
	/* Zero-width member: no further bit-field shares this storage unit. */
	uint64_t : 0;
} batch_set_cache_attr_state_t;

/* Values stored in batch_set_cache_attr_state_t::state. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS     1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS   2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE            4

static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
909
910 PMAP_SUPPORT_PROTOTYPES(
911 batch_set_cache_attr_state_t,
912 pmap_batch_set_cache_attributes, (
913 #if XNU_MONITOR
914 volatile upl_page_info_t *user_page_list,
915 #else /* !XNU_MONITOR */
916 upl_page_info_array_t user_page_list,
917 #endif /* XNU_MONITOR */
918 batch_set_cache_attr_state_t state,
919 unsigned int page_cnt,
920 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
921
922 PMAP_SUPPORT_PROTOTYPES(
923 kern_return_t,
924 pmap_change_wiring, (pmap_t pmap,
925 vm_map_address_t v,
926 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
927
928 PMAP_SUPPORT_PROTOTYPES(
929 pmap_t,
930 pmap_create_options, (ledger_t ledger,
931 vm_map_size_t size,
932 unsigned int flags,
933 kern_return_t * kr), PMAP_CREATE_INDEX);
934
935 PMAP_SUPPORT_PROTOTYPES(
936 void,
937 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
938
939 PMAP_SUPPORT_PROTOTYPES(
940 kern_return_t,
941 pmap_enter_options, (pmap_t pmap,
942 vm_map_address_t v,
943 pmap_paddr_t pa,
944 vm_prot_t prot,
945 vm_prot_t fault_type,
946 unsigned int flags,
947 boolean_t wired,
948 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
949
950 PMAP_SUPPORT_PROTOTYPES(
951 pmap_paddr_t,
952 pmap_find_pa, (pmap_t pmap,
953 addr64_t va), PMAP_FIND_PA_INDEX);
954
955 PMAP_SUPPORT_PROTOTYPES(
956 kern_return_t,
957 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
958
959
960 PMAP_SUPPORT_PROTOTYPES(
961 boolean_t,
962 pmap_is_empty, (pmap_t pmap,
963 vm_map_offset_t va_start,
964 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
965
966
967 PMAP_SUPPORT_PROTOTYPES(
968 unsigned int,
969 pmap_map_cpu_windows_copy, (ppnum_t pn,
970 vm_prot_t prot,
971 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
972
973 PMAP_SUPPORT_PROTOTYPES(
974 void,
975 pmap_ro_zone_memcpy, (zone_id_t zid,
976 vm_offset_t va,
977 vm_offset_t offset,
978 const vm_offset_t new_data,
979 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
980
981 PMAP_SUPPORT_PROTOTYPES(
982 uint64_t,
983 pmap_ro_zone_atomic_op, (zone_id_t zid,
984 vm_offset_t va,
985 vm_offset_t offset,
986 zro_atomic_op_t op,
987 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
988
989 PMAP_SUPPORT_PROTOTYPES(
990 void,
991 pmap_ro_zone_bzero, (zone_id_t zid,
992 vm_offset_t va,
993 vm_offset_t offset,
994 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
995
996 PMAP_SUPPORT_PROTOTYPES(
997 vm_map_offset_t,
998 pmap_nest, (pmap_t grand,
999 pmap_t subord,
1000 addr64_t vstart,
1001 uint64_t size,
1002 vm_map_offset_t vrestart,
1003 kern_return_t * krp), PMAP_NEST_INDEX);
1004
1005 PMAP_SUPPORT_PROTOTYPES(
1006 void,
1007 pmap_page_protect_options, (ppnum_t ppnum,
1008 vm_prot_t prot,
1009 unsigned int options,
1010 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1011
1012 PMAP_SUPPORT_PROTOTYPES(
1013 vm_map_address_t,
1014 pmap_protect_options, (pmap_t pmap,
1015 vm_map_address_t start,
1016 vm_map_address_t end,
1017 vm_prot_t prot,
1018 unsigned int options,
1019 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1020
1021 PMAP_SUPPORT_PROTOTYPES(
1022 kern_return_t,
1023 pmap_query_page_info, (pmap_t pmap,
1024 vm_map_offset_t va,
1025 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1026
1027 PMAP_SUPPORT_PROTOTYPES(
1028 mach_vm_size_t,
1029 pmap_query_resident, (pmap_t pmap,
1030 vm_map_address_t start,
1031 vm_map_address_t end,
1032 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1033
1034 PMAP_SUPPORT_PROTOTYPES(
1035 void,
1036 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1037
1038 PMAP_SUPPORT_PROTOTYPES(
1039 vm_map_address_t,
1040 pmap_remove_options, (pmap_t pmap,
1041 vm_map_address_t start,
1042 vm_map_address_t end,
1043 int options), PMAP_REMOVE_OPTIONS_INDEX);
1044
1045
1046 PMAP_SUPPORT_PROTOTYPES(
1047 void,
1048 pmap_set_cache_attributes, (ppnum_t pn,
1049 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1050
1051 PMAP_SUPPORT_PROTOTYPES(
1052 void,
1053 pmap_update_compressor_page, (ppnum_t pn,
1054 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1055
1056 PMAP_SUPPORT_PROTOTYPES(
1057 void,
1058 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1059
1060 #if MACH_ASSERT || XNU_MONITOR
1061 PMAP_SUPPORT_PROTOTYPES(
1062 void,
1063 pmap_set_process, (pmap_t pmap,
1064 int pid,
1065 char *procname), PMAP_SET_PROCESS_INDEX);
1066 #endif
1067
1068 PMAP_SUPPORT_PROTOTYPES(
1069 void,
1070 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1071
1072 PMAP_SUPPORT_PROTOTYPES(
1073 vm_map_offset_t,
1074 pmap_unnest_options, (pmap_t grand,
1075 addr64_t vaddr,
1076 uint64_t size,
1077 vm_map_offset_t vrestart,
1078 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1079
1080 PMAP_SUPPORT_PROTOTYPES(
1081 void,
1082 phys_attribute_set, (ppnum_t pn,
1083 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1084
1085 PMAP_SUPPORT_PROTOTYPES(
1086 void,
1087 phys_attribute_clear, (ppnum_t pn,
1088 unsigned int bits,
1089 int options,
1090 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1091
1092 #if __ARM_RANGE_TLBI__
1093 PMAP_SUPPORT_PROTOTYPES(
1094 vm_map_address_t,
1095 phys_attribute_clear_range, (pmap_t pmap,
1096 vm_map_address_t start,
1097 vm_map_address_t end,
1098 unsigned int bits,
1099 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1100 #endif /* __ARM_RANGE_TLBI__ */
1101
1102
1103 PMAP_SUPPORT_PROTOTYPES(
1104 void,
1105 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1106
1107 PMAP_SUPPORT_PROTOTYPES(
1108 void,
1109 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1110
1111 PMAP_SUPPORT_PROTOTYPES(
1112 void,
1113 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1114
1115 PMAP_SUPPORT_PROTOTYPES(
1116 void,
1117 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1118
1119 PMAP_SUPPORT_PROTOTYPES(
1120 void,
1121 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1122
1123 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1124 PMAP_SUPPORT_PROTOTYPES(
1125 void,
1126 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1127 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1128
/**
 * States used by pmap_trim(). The numeric values follow declaration order,
 * starting at zero.
 */
typedef enum {
	/*
	 * Validates the inputs and computes the bounds of the pmaps. This state
	 * can also jump directly to DONE state in some cases.
	 */
	PMAP_TRIM_STATE_START        = 0,

	/*
	 * Trims the range from the start of the shared region to the "true"
	 * start of that of the grand pmap.
	 */
	PMAP_TRIM_STATE_GRAND_BEFORE = 1,

	/*
	 * Trims the range from the "true" end of the shared region to the end
	 * of that of the grand pmap.
	 */
	PMAP_TRIM_STATE_GRAND_AFTER  = 2,

	/*
	 * Decreases the subord's "no-bound" reference by one. If that becomes
	 * zero, trims the subord.
	 */
	PMAP_TRIM_STATE_SUBORD       = 3,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE         = 4,

	/* Sentry enum for sanity checks; equals the number of real states. */
	PMAP_TRIM_STATE_COUNT        = 5,
} pmap_trim_state_t;
1149
1150 PMAP_SUPPORT_PROTOTYPES(
1151 pmap_trim_state_t,
1152 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1153
1154 #if HAS_APPLE_PAC
1155 PMAP_SUPPORT_PROTOTYPES(
1156 void *,
1157 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1158 PMAP_SUPPORT_PROTOTYPES(
1159 void *,
1160 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1161 #endif /* HAS_APPLE_PAC */
1162
1163
1164
1165
1166 PMAP_SUPPORT_PROTOTYPES(
1167 kern_return_t,
1168 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1169 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1170
1171 PMAP_SUPPORT_PROTOTYPES(
1172 kern_return_t,
1173 pmap_load_trust_cache_with_type, (TCType_t type,
1174 const vm_address_t pmap_img4_payload,
1175 const vm_size_t pmap_img4_payload_len,
1176 const vm_address_t img4_manifest,
1177 const vm_size_t img4_manifest_len,
1178 const vm_address_t img4_aux_manifest,
1179 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1180
1181 PMAP_SUPPORT_PROTOTYPES(
1182 void,
1183 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1184
1185 PMAP_SUPPORT_PROTOTYPES(
1186 kern_return_t,
1187 pmap_query_trust_cache, (TCQueryType_t query_type,
1188 const uint8_t cdhash[kTCEntryHashSize],
1189 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1190
1191 #if PMAP_CS_INCLUDE_CODE_SIGNING
1192
1193 PMAP_SUPPORT_PROTOTYPES(
1194 kern_return_t,
1195 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1196 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1197
1198 PMAP_SUPPORT_PROTOTYPES(
1199 kern_return_t,
1200 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1201 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1202
1203 PMAP_SUPPORT_PROTOTYPES(
1204 kern_return_t,
1205 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1206 pmap_cs_profile_t * profile_obj),
1207 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1208
1209 PMAP_SUPPORT_PROTOTYPES(
1210 kern_return_t,
1211 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1212 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1213
1214 PMAP_SUPPORT_PROTOTYPES(
1215 kern_return_t,
1216 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1217 const void *kernel_entitlements),
1218 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1219
1220 PMAP_SUPPORT_PROTOTYPES(
1221 kern_return_t,
1222 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1223 const void **kernel_entitlements),
1224 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1225
1226 PMAP_SUPPORT_PROTOTYPES(
1227 kern_return_t,
1228 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1229 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1230
1231 PMAP_SUPPORT_PROTOTYPES(
1232 kern_return_t,
1233 pmap_cs_allow_invalid, (pmap_t pmap),
1234 PMAP_CS_ALLOW_INVALID_INDEX);
1235
1236 PMAP_SUPPORT_PROTOTYPES(
1237 void,
1238 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1239 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1240
1241 PMAP_SUPPORT_PROTOTYPES(
1242 bool,
1243 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1244 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1245
1246 PMAP_SUPPORT_PROTOTYPES(
1247 void,
1248 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1249 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1250
1251 PMAP_SUPPORT_PROTOTYPES(
1252 void,
1253 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1254 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1255
1256 #endif
1257
1258 PMAP_SUPPORT_PROTOTYPES(
1259 uint32_t,
1260 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1261
1262 PMAP_SUPPORT_PROTOTYPES(
1263 bool,
1264 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1265
1266 PMAP_SUPPORT_PROTOTYPES(
1267 void,
1268 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1269
1270 void pmap_footprint_suspend(vm_map_t map,
1271 boolean_t suspend);
1272 PMAP_SUPPORT_PROTOTYPES(
1273 void,
1274 pmap_footprint_suspend, (vm_map_t map,
1275 boolean_t suspend),
1276 PMAP_FOOTPRINT_SUSPEND_INDEX);
1277
1278
1279
1280
1281 #if DEVELOPMENT || DEBUG
1282 PMAP_SUPPORT_PROTOTYPES(
1283 kern_return_t,
1284 pmap_test_text_corruption, (pmap_paddr_t),
1285 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1286 #endif /* DEVELOPMENT || DEBUG */
1287
1288 /*
1289 * The low global vector page is mapped at a fixed alias.
1290 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1291 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1292 * to check both addresses anyway for backward compatibility. So for now
1293 * we leave H6 and H7 where they were.
1294 */
1295 #if (ARM_PGSHIFT == 14)
1296 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1297 #else
1298 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1299 #endif
1300
1301
1302 long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1303 long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1304 long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1305
1306 #if XNU_MONITOR
1307
1308 #if __has_feature(ptrauth_calls)
1309 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1310 #else
1311 #define __ptrauth_ppl_handler
1312 #endif
1313
1314 /*
1315 * Table of function pointers used for PPL dispatch.
1316 */
1317 const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
1318 [ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
1319 [ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
1320 [MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
1321 [PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
1322 [PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
1323 [PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
1324 [PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
1325 [PMAP_CREATE_INDEX] = pmap_create_options_internal,
1326 [PMAP_DESTROY_INDEX] = pmap_destroy_internal,
1327 [PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
1328 [PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
1329 [PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
1330 [PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
1331 [PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
1332 [PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
1333 [PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
1334 [PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
1335 [PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
1336 [PMAP_NEST_INDEX] = pmap_nest_internal,
1337 [PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
1338 [PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
1339 [PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
1340 [PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
1341 [PMAP_REFERENCE_INDEX] = pmap_reference_internal,
1342 [PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
1343 [PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
1344 [PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
1345 [PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
1346 [PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
1347 [PMAP_SWITCH_INDEX] = pmap_switch_internal,
1348 [PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
1349 [PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
1350 [PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
1351 [PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
1352 [PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
1353 [PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
1354 [PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
1355 [PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
1356 [PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
1357 [PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
1358 [PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
1359 [PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
1360 [PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
1361 [PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
1362 [PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
1363 #if PMAP_CS_INCLUDE_CODE_SIGNING
1364 [PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
1365 [PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
1366 [PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
1367 [PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
1368 [PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
1369 [PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
1370 [PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
1371 [PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
1372 [PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
1373 [PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
1374 [PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
1375 #endif
1376 [PMAP_TRIM_INDEX] = pmap_trim_internal,
1377 [PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
1378 [PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
1379 [PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
1380 #if HAS_APPLE_PAC
1381 [PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
1382 [PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
1383 #endif /* HAS_APPLE_PAC */
1384 #if __ARM_RANGE_TLBI__
1385 [PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
1386 #endif /* __ARM_RANGE_TLBI__ */
1387 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1388 [PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
1389 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1390 [PMAP_NOP_INDEX] = pmap_nop_internal,
1391
1392 #if DEVELOPMENT || DEBUG
1393 [PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
1394 #endif /* DEVELOPMENT || DEBUG */
1395
1396 };
1397 #endif
1398
1399 #if XNU_MONITOR
1400 /**
1401 * A convenience function for setting protections on a single physical
1402 * aperture or static region mapping without invalidating the TLB.
1403 *
1404 * @note This function does not perform any TLB invalidations. That must be done
1405 * separately to be able to safely use the updated mapping.
1406 *
1407 * @note This function understands the difference between the VM page size and
1408 * the kernel page size and will update multiple PTEs if the sizes differ.
1409 * In other words, enough PTEs will always get updated to change the
1410 * permissions on a PAGE_SIZE amount of memory.
1411 *
1412 * @note The PVH lock for the physical page represented by this mapping must
1413 * already be locked.
1414 *
1415 * @note This function assumes the caller has already verified that the PTE
1416 * pointer does indeed point to a physical aperture or static region page
1417 * table. Please validate your inputs before passing it along to this
1418 * function.
1419 *
1420 * @param ptep Pointer to the physical aperture or static region page table to
1421 * update with a new XPRR index.
1422 * @param expected_perm The XPRR index that is expected to already exist at the
1423 * current mapping. If the current index doesn't match this
1424 * then the system will panic.
1425 * @param new_perm The new XPRR index to update the mapping with.
1426 */
1427 MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(pt_entry_t * const ptep,unsigned int expected_perm,unsigned int new_perm)1428 pmap_set_pte_xprr_perm(
1429 pt_entry_t * const ptep,
1430 unsigned int expected_perm,
1431 unsigned int new_perm)
1432 {
1433 assert(ptep != NULL);
1434
1435 pt_entry_t spte = *ptep;
1436 pvh_assert_locked(pa_index(pte_to_pa(spte)));
1437
1438 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1439 panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
1440 __func__, ptep, new_perm, expected_perm);
1441 }
1442
1443 /**
1444 * The PTE involved should be valid, should not have the hint bit set, and
1445 * should have the expected XPRR index.
1446 */
1447 if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
1448 panic_plain("%s: physical aperture or static region PTE is invalid, "
1449 "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
1450 __func__, ptep, spte, new_perm, expected_perm);
1451 }
1452
1453 if (__improbable(spte & ARM_PTE_HINT_MASK)) {
1454 panic_plain("%s: physical aperture or static region PTE has hint bit "
1455 "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
1456 __func__, ptep, spte, new_perm, expected_perm);
1457 }
1458
1459 if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
1460 panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
1461 "ptep=%p, new_perm=%u, expected_perm=%u",
1462 __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
1463 }
1464
1465 pt_entry_t template = spte;
1466 template &= ~ARM_PTE_XPRR_MASK;
1467 template |= xprr_perm_to_pte(new_perm);
1468
1469 write_pte_strong(ptep, template);
1470 }
1471
1472 /**
1473 * Update the protections on a single physical aperture mapping and invalidate
1474 * the TLB so the mapping can be used.
1475 *
1476 * @note The PVH lock for the physical page must already be locked.
1477 *
1478 * @param pai The physical address index of the page whose physical aperture
1479 * mapping will be updated with new permissions.
1480 * @param expected_perm The XPRR index that is expected to already exist at the
1481 * current mapping. If the current index doesn't match this
1482 * then the system will panic.
1483 * @param new_perm The new XPRR index to update the mapping with.
1484 */
1485 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1486 pmap_set_xprr_perm(
1487 unsigned int pai,
1488 unsigned int expected_perm,
1489 unsigned int new_perm)
1490 {
1491 pvh_assert_locked(pai);
1492
1493 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1494 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1495
1496 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1497
1498 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1499 sync_tlb_flush();
1500 }
1501
1502 /**
1503 * Update the protections on a range of physical aperture or static region
1504 * mappings and invalidate the TLB so the mappings can be used.
1505 *
1506 * @note Static region mappings can only be updated before machine_lockdown().
1507 * Physical aperture mappings can be updated at any time.
1508 *
1509 * @param start The starting virtual address of the static region or physical
1510 * aperture range whose permissions will be updated.
1511 * @param end The final (inclusive) virtual address of the static region or
1512 * physical aperture range whose permissions will be updated.
1513 * @param expected_perm The XPRR index that is expected to already exist at the
1514 * current mappings. If the current indices don't match
1515 * this then the system will panic.
1516 * @param new_perm The new XPRR index to update the mappings with.
1517 */
1518 MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(vm_address_t start,vm_address_t end,unsigned int expected_perm,unsigned int new_perm)1519 pmap_set_range_xprr_perm(
1520 vm_address_t start,
1521 vm_address_t end,
1522 unsigned int expected_perm,
1523 unsigned int new_perm)
1524 {
1525 /**
1526 * Validate our arguments; any invalid argument will be grounds for a panic.
1527 */
1528 if (__improbable((start | end) & ARM_PGMASK)) {
1529 panic_plain("%s: start or end not page aligned, "
1530 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1531 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1532 }
1533
1534 if (__improbable(start > end)) {
1535 panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
1536 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1537 }
1538
1539 const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
1540 const bool in_static = (start >= gVirtBase) && (end < static_memory_end);
1541
1542 if (__improbable(!(in_physmap || in_static))) {
1543 panic_plain("%s: address not in static region or physical aperture, "
1544 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1545 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1546 }
1547
1548 if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
1549 panic_plain("%s: invalid XPRR index, "
1550 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1551 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1552 }
1553
1554 /*
1555 * Walk over the PTEs for the given range, and set the protections on those
1556 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
1557 * one twig entry (whichever twig entry currently maps "va").
1558 */
1559 vm_address_t va = start;
1560 while (va < end) {
1561 /**
1562 * Get the last VA that the twig entry for "va" maps. All of the leaf
1563 * PTEs from va to tte_va_end will have their permissions updated.
1564 */
1565 vm_address_t tte_va_end =
1566 (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);
1567
1568 if (tte_va_end > end) {
1569 tte_va_end = end;
1570 }
1571
1572 tt_entry_t *ttep = pmap_tte(kernel_pmap, va);
1573
1574 if (ttep == NULL) {
1575 panic_plain("%s: physical aperture or static region tte is NULL, "
1576 "start=%p, end=%p, new_perm=%u, expected_perm=%u",
1577 __func__, (void *)start, (void *)end, new_perm, expected_perm);
1578 }
1579
1580 tt_entry_t tte = *ttep;
1581
1582 if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
1583 panic_plain("%s: tte=0x%llx is not a table type entry, "
1584 "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
1585 tte, (void *)start, (void *)end, new_perm, expected_perm);
1586 }
1587
1588 /* Walk over the given L3 page table page and update the PTEs. */
1589 pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
1590 pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
1591 const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
1592 pt_entry_t * const end_ptep = begin_ptep + num_ptes;
1593
1594 /**
1595 * The current PTE pointer is incremented by the page ratio (ratio of
1596 * VM page size to kernel hardware page size) because one call to
1597 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
1598 * a PAGE_SIZE worth of hardware pages.
1599 */
1600 for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
1601 cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
1602 unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
1603 pvh_lock(pai);
1604 pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
1605 pvh_unlock(pai);
1606 }
1607
1608 va = tte_va_end;
1609 }
1610
1611 PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
1612 }
1613
1614 #endif /* XNU_MONITOR */
1615
1616 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1617 PMAP_ZINFO_PALLOC(
1618 pmap_t pmap, int bytes)
1619 {
1620 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1621 }
1622
1623 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1624 PMAP_ZINFO_PFREE(
1625 pmap_t pmap,
1626 int bytes)
1627 {
1628 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1629 }
1630
1631 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1632 pmap_tt_ledger_credit(
1633 pmap_t pmap,
1634 vm_size_t size)
1635 {
1636 if (pmap != kernel_pmap) {
1637 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1638 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1639 }
1640 }
1641
1642 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1643 pmap_tt_ledger_debit(
1644 pmap_t pmap,
1645 vm_size_t size)
1646 {
1647 if (pmap != kernel_pmap) {
1648 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1649 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1650 }
1651 }
1652
1653 static inline void
pmap_update_plru(uint16_t asid_index __unused)1654 pmap_update_plru(uint16_t asid_index __unused)
1655 {
1656 #if !HAS_16BIT_ASID
1657 if (__probable(pmap_asid_plru)) {
1658 unsigned plru_index = asid_index >> 6;
1659 if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
1660 asid_plru_generation[plru_index] = ++asid_plru_gencount;
1661 asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
1662 }
1663 }
1664 #endif /* !HAS_16BIT_ASID */
1665 }
1666
1667 static bool
alloc_asid(pmap_t pmap)1668 alloc_asid(pmap_t pmap)
1669 {
1670 int vasid = -1;
1671 uint16_t hw_asid;
1672
1673 pmap_simple_lock(&asid_lock);
1674
1675 #if !HAS_16BIT_ASID
1676 if (__probable(pmap_asid_plru)) {
1677 unsigned plru_index = 0;
1678 uint64_t lowest_gen = asid_plru_generation[0];
1679 uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
1680 for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
1681 if (asid_plru_generation[i] < lowest_gen) {
1682 plru_index = i;
1683 lowest_gen = asid_plru_generation[i];
1684 lowest_gen_bitmap = asid_plru_bitmap[i];
1685 }
1686 }
1687
1688 for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
1689 uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
1690 if (temp_plru) {
1691 vasid = (plru_index << 6) + lsb_first(temp_plru);
1692 #if DEVELOPMENT || DEBUG
1693 ++pmap_asid_hits;
1694 #endif
1695 break;
1696 }
1697 }
1698 }
1699 #else
1700 /**
1701 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
1702 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
1703 * However, we first try to allocate starting from the position of the most-recently allocated
1704 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
1705 * lower bit positions and then re-checking those same lower positions every time we allocate
1706 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
1707 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
1708 * logic, without requiring prohibitively expensive RCTX instructions.
1709 */
1710 vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
1711 #endif /* !HAS_16BIT_ASID */
1712 if (__improbable(vasid < 0)) {
1713 // bitmap_first() returns highest-order bits first, but a 0-based scheme works
1714 // slightly better with the collision detection scheme used by pmap_switch_internal().
1715 vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
1716 #if DEVELOPMENT || DEBUG
1717 ++pmap_asid_misses;
1718 #endif
1719 }
1720 if (__improbable(vasid < 0)) {
1721 pmap_simple_unlock(&asid_lock);
1722 return false;
1723 }
1724 assert((uint32_t)vasid < pmap_max_asids);
1725 assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
1726 bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
1727 #if HAS_16BIT_ASID
1728 last_allocated_asid = (uint16_t)vasid;
1729 #endif /* HAS_16BIT_ASID */
1730 pmap_simple_unlock(&asid_lock);
1731 hw_asid = (uint16_t)(vasid % asid_chunk_size);
1732 pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
1733 if (__improbable(hw_asid == MAX_HW_ASIDS)) {
1734 /* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
1735 * reassign to a reserved VASID. */
1736 assert(pmap->sw_asid < UINT8_MAX);
1737 pmap->sw_asid = UINT8_MAX;
1738 /* Allocate from the high end of the hardware ASID range to reduce the likelihood of
1739 * aliasing with vital system processes, which are likely to have lower ASIDs. */
1740 hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
1741 assert(hw_asid < MAX_HW_ASIDS);
1742 }
1743 pmap_update_plru(hw_asid);
1744 hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
1745 #if __ARM_KERNEL_PROTECT__
1746 hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
1747 #endif
1748 pmap->hw_asid = hw_asid;
1749 return true;
1750 }
1751
1752 static void
free_asid(pmap_t pmap)1753 free_asid(pmap_t pmap)
1754 {
1755 unsigned int vasid;
1756 uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
1757 if (__improbable(hw_asid == 0)) {
1758 return;
1759 }
1760
1761 #if __ARM_KERNEL_PROTECT__
1762 hw_asid >>= 1;
1763 #endif
1764 hw_asid -= 1;
1765
1766 #if HAS_16BIT_ASID
1767 vasid = hw_asid;
1768 #else
1769 if (__improbable(pmap->sw_asid == UINT8_MAX)) {
1770 vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
1771 } else {
1772 vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
1773 }
1774
1775 if (__probable(pmap_asid_plru)) {
1776 os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
1777 }
1778 #endif /* HAS_16BIT_ASID */
1779 pmap_simple_lock(&asid_lock);
1780 assert(!bitmap_test(&asid_bitmap[0], vasid));
1781 bitmap_set(&asid_bitmap[0], vasid);
1782 pmap_simple_unlock(&asid_lock);
1783 }
1784
1785
1786 boolean_t
pmap_valid_address(pmap_paddr_t addr)1787 pmap_valid_address(
1788 pmap_paddr_t addr)
1789 {
1790 return pa_valid(addr);
1791 }
1792
1793
1794
1795
1796
1797
1798 /*
1799 * Map memory at initialization. The physical addresses being
1800 * mapped are not managed and are never unmapped.
1801 *
1802 * For now, VM is already on, we only need to map the
1803 * specified memory.
1804 */
1805 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1806 pmap_map(
1807 vm_map_address_t virt,
1808 vm_offset_t start,
1809 vm_offset_t end,
1810 vm_prot_t prot,
1811 unsigned int flags)
1812 {
1813 kern_return_t kr;
1814 vm_size_t ps;
1815
1816 ps = PAGE_SIZE;
1817 while (start < end) {
1818 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1819 prot, VM_PROT_NONE, flags, FALSE);
1820
1821 if (kr != KERN_SUCCESS) {
1822 panic("%s: failed pmap_enter, "
1823 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1824 __FUNCTION__,
1825 (void *) virt, (void *) start, (void *) end, prot, flags);
1826 }
1827
1828 virt += ps;
1829 start += ps;
1830 }
1831 return virt;
1832 }
1833
1834 #if XNU_MONITOR
1835 /**
1836 * Remove kernel writeablity from an IO PTE value if the page is owned by
1837 * guarded mode software.
1838 *
1839 * @param paddr The physical address of the page which has to be non-DRAM.
1840 * @param tmplate The PTE value to be evaluated.
1841 *
1842 * @return A new PTE value with permission bits modified.
1843 */
1844 static inline
1845 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1846 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1847 {
1848 assert(!pa_valid(paddr));
1849
1850 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1851
1852 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1853 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1854 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1855 switch (xprr_perm) {
1856 case XPRR_KERN_RO_PERM:
1857 break;
1858 case XPRR_KERN_RW_PERM:
1859 tmplate &= ~ARM_PTE_XPRR_MASK;
1860 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1861 break;
1862 default:
1863 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1864 }
1865 }
1866
1867 return tmplate;
1868 }
1869 #endif /* XNU_MONITOR */
1870
1871 vm_map_address_t
pmap_map_bd_with_options(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,int32_t options)1872 pmap_map_bd_with_options(
1873 vm_map_address_t virt,
1874 vm_offset_t start,
1875 vm_offset_t end,
1876 vm_prot_t prot,
1877 int32_t options)
1878 {
1879 pt_entry_t mem_attr;
1880
1881 switch (options & PMAP_MAP_BD_MASK) {
1882 case PMAP_MAP_BD_WCOMB:
1883 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
1884 mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
1885 break;
1886 case PMAP_MAP_BD_POSTED:
1887 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
1888 break;
1889 case PMAP_MAP_BD_POSTED_REORDERED:
1890 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
1891 break;
1892 case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
1893 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
1894 break;
1895 default:
1896 mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
1897 break;
1898 }
1899
1900 /* not cacheable and not buffered */
1901 pt_entry_t tmplate = pa_to_pte(start)
1902 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
1903 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
1904 | mem_attr;
1905
1906 #if __ARM_KERNEL_PROTECT__
1907 tmplate |= ARM_PTE_NG;
1908 #endif /* __ARM_KERNEL_PROTECT__ */
1909
1910 vm_map_address_t vaddr = virt;
1911 vm_offset_t paddr = start;
1912 while (paddr < end) {
1913 pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
1914 if (ptep == PT_ENTRY_NULL) {
1915 panic("pmap_map_bd");
1916 }
1917
1918 /**
1919 * For every iteration, the paddr encoded in tmplate is incrementing,
1920 * but we always start with the original AP bits defined at the top
1921 * of the function in tmplate and only modify the AP bits in the pte
1922 * variable.
1923 */
1924 pt_entry_t pte;
1925 #if XNU_MONITOR
1926 if (!pa_valid(paddr)) {
1927 pte = pmap_construct_io_pte(paddr, tmplate);
1928 } else {
1929 pte = tmplate;
1930 }
1931 #else /* !XNU_MONITOR */
1932 pte = tmplate;
1933 #endif
1934
1935 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
1936 write_pte_strong(ptep, pte);
1937
1938 pte_increment_pa(tmplate);
1939 vaddr += PAGE_SIZE;
1940 paddr += PAGE_SIZE;
1941 }
1942
1943 if (end >= start) {
1944 flush_mmu_tlb_region(virt, (unsigned)(end - start));
1945 }
1946
1947 return vaddr;
1948 }
1949
1950 /*
1951 * Back-door routine for mapping kernel VM at initialization.
1952 * Useful for mapping memory outside the range
1953 * [vm_first_phys, vm_last_phys] (i.e., devices).
1954 * Otherwise like pmap_map.
1955 */
1956 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1957 pmap_map_bd(
1958 vm_map_address_t virt,
1959 vm_offset_t start,
1960 vm_offset_t end,
1961 vm_prot_t prot)
1962 {
1963 return pmap_map_bd_with_options(virt, start, end, prot, 0);
1964 }
1965
1966 /*
1967 * Back-door routine for mapping kernel VM at initialization.
1968 * Useful for mapping memory specific physical addresses in early
1969 * boot (i.e., before kernel_map is initialized).
1970 *
1971 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1972 */
1973
1974 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1975 pmap_map_high_window_bd(
1976 vm_offset_t pa_start,
1977 vm_size_t len,
1978 vm_prot_t prot)
1979 {
1980 pt_entry_t *ptep, pte;
1981 vm_map_address_t va_start = VREGION1_START;
1982 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1983 vm_map_address_t va_end;
1984 vm_map_address_t va;
1985 vm_size_t offset;
1986
1987 offset = pa_start & PAGE_MASK;
1988 pa_start -= offset;
1989 len += offset;
1990
1991 if (len > (va_max - va_start)) {
1992 panic("%s: area too large, "
1993 "pa_start=%p, len=%p, prot=0x%x",
1994 __FUNCTION__,
1995 (void*)pa_start, (void*)len, prot);
1996 }
1997
1998 scan:
1999 for (; va_start < va_max; va_start += PAGE_SIZE) {
2000 ptep = pmap_pte(kernel_pmap, va_start);
2001 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2002 if (*ptep == ARM_PTE_TYPE_FAULT) {
2003 break;
2004 }
2005 }
2006 if (va_start > va_max) {
2007 panic("%s: insufficient pages, "
2008 "pa_start=%p, len=%p, prot=0x%x",
2009 __FUNCTION__,
2010 (void*)pa_start, (void*)len, prot);
2011 }
2012
2013 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2014 ptep = pmap_pte(kernel_pmap, va_end);
2015 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2016 if (*ptep != ARM_PTE_TYPE_FAULT) {
2017 va_start = va_end + PAGE_SIZE;
2018 goto scan;
2019 }
2020 }
2021
2022 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2023 ptep = pmap_pte(kernel_pmap, va);
2024 pte = pa_to_pte(pa_start)
2025 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2026 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2027 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2028 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2029 #if __ARM_KERNEL_PROTECT__
2030 pte |= ARM_PTE_NG;
2031 #endif /* __ARM_KERNEL_PROTECT__ */
2032 write_pte_strong(ptep, pte);
2033 }
2034 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2035 #if KASAN
2036 kasan_notify_address(va_start, len);
2037 #endif
2038 return va_start;
2039 }
2040
2041 static uint32_t
pmap_compute_max_asids(void)2042 pmap_compute_max_asids(void)
2043 {
2044 DTEntry entry;
2045 void const *prop = NULL;
2046 uint32_t max_asids;
2047 int err;
2048 unsigned int prop_size;
2049
2050 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2051 assert(err == kSuccess);
2052
2053 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2054 /* TODO: consider allowing maxproc limits to be scaled earlier so that
2055 * we can choose a more flexible default value here. */
2056 return MAX_ASIDS;
2057 }
2058
2059 if (prop_size != sizeof(max_asids)) {
2060 panic("pmap-max-asids property is not a 32-bit integer");
2061 }
2062
2063 max_asids = *((uint32_t const *)prop);
2064 #if HAS_16BIT_ASID
2065 if (max_asids > MAX_HW_ASIDS) {
2066 panic("pmap-max-asids 0x%x too large", max_asids);
2067 }
2068 #else
2069 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2070 max_asids = (max_asids + 63) & ~63UL;
2071
2072 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2073 /* currently capped by size of pmap->sw_asid */
2074 panic("pmap-max-asids 0x%x too large", max_asids);
2075 }
2076 #endif /* HAS_16BIT_ASID */
2077 if (max_asids == 0) {
2078 panic("pmap-max-asids cannot be zero");
2079 }
2080 return max_asids;
2081 }
2082
2083 #if __arm64__
2084 /*
2085 * pmap_get_arm64_prot
2086 *
2087 * return effective armv8 VMSA block protections including
2088 * table AP/PXN/XN overrides of a pmap entry
2089 *
2090 */
2091
2092 uint64_t
pmap_get_arm64_prot(pmap_t pmap,vm_offset_t addr)2093 pmap_get_arm64_prot(
2094 pmap_t pmap,
2095 vm_offset_t addr)
2096 {
2097 tt_entry_t tte = 0;
2098 unsigned int level = 0;
2099 uint64_t tte_type = 0;
2100 uint64_t effective_prot_bits = 0;
2101 uint64_t aggregate_tte = 0;
2102 uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
2103 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2104
2105 for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
2106 tte = *pmap_ttne(pmap, level, addr);
2107
2108 if (!(tte & ARM_TTE_VALID)) {
2109 return 0;
2110 }
2111
2112 tte_type = tte & ARM_TTE_TYPE_MASK;
2113
2114 if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
2115 (level == pt_attr->pta_max_level)) {
2116 /* Block or page mapping; both have the same protection bit layout. */
2117 break;
2118 } else if (tte_type == ARM_TTE_TYPE_TABLE) {
2119 /* All of the table bits we care about are overrides, so just OR them together. */
2120 aggregate_tte |= tte;
2121 }
2122 }
2123
2124 table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
2125 table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
2126 table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);
2127
2128 /* Start with the PTE bits. */
2129 effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);
2130
2131 /* Table AP bits mask out block/page AP bits */
2132 effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));
2133
2134 /* XN/PXN bits can be OR'd in. */
2135 effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
2136 effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);
2137
2138 return effective_prot_bits;
2139 }
2140 #endif /* __arm64__ */
2141
2142 /*
2143 * Bootstrap the system enough to run with virtual memory.
2144 *
2145 * The early VM initialization code has already allocated
2146 * the first CPU's translation table and made entries for
2147 * all the one-to-one mappings to be found there.
2148 *
2149 * We must set up the kernel pmap structures, the
2150 * physical-to-virtual translation lookup tables for the
2151 * physical memory to be managed (between avail_start and
2152 * avail_end).
2153 *
2154 * Map the kernel's code and data, and allocate the system page table.
2155 * Page_size must already be set.
2156 *
2157 * Parameters:
2158 * first_avail first available physical page -
2159 * after kernel page tables
2160 * avail_start PA of first managed physical page
2161 * avail_end PA of last managed physical page
2162 */
2163
2164 void
pmap_bootstrap(vm_offset_t vstart)2165 pmap_bootstrap(
2166 vm_offset_t vstart)
2167 {
2168 vm_map_offset_t maxoffset;
2169
2170 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2171
2172 #if XNU_MONITOR
2173
2174 #if DEVELOPMENT || DEBUG
2175 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2176 #endif
2177
2178 #if CONFIG_CSR_FROM_DT
2179 if (csr_unsafe_kernel_text) {
2180 pmap_ppl_disable = true;
2181 }
2182 #endif /* CONFIG_CSR_FROM_DT */
2183
2184 #endif /* XNU_MONITOR */
2185
2186 #if DEVELOPMENT || DEBUG
2187 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2188 kprintf("Kernel traces for pmap operations enabled\n");
2189 }
2190 #endif
2191
2192 /*
2193 * Initialize the kernel pmap.
2194 */
2195 #if ARM_PARAMETERIZED_PMAP
2196 kernel_pmap->pmap_pt_attr = native_pt_attr;
2197 #endif /* ARM_PARAMETERIZED_PMAP */
2198 #if HAS_APPLE_PAC
2199 kernel_pmap->disable_jop = 0;
2200 #endif /* HAS_APPLE_PAC */
2201 kernel_pmap->tte = cpu_tte;
2202 kernel_pmap->ttep = cpu_ttep;
2203 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2204 kernel_pmap->max = UINTPTR_MAX;
2205 os_atomic_init(&kernel_pmap->ref_count, 1);
2206 #if XNU_MONITOR
2207 os_atomic_init(&kernel_pmap->nested_count, 0);
2208 #endif
2209 kernel_pmap->nx_enabled = TRUE;
2210 #ifdef __arm64__
2211 kernel_pmap->is_64bit = TRUE;
2212 #else
2213 kernel_pmap->is_64bit = FALSE;
2214 #endif
2215 #if CONFIG_ROSETTA
2216 kernel_pmap->is_rosetta = FALSE;
2217 #endif
2218
2219 #if ARM_PARAMETERIZED_PMAP
2220 kernel_pmap->pmap_pt_attr = native_pt_attr;
2221 #endif /* ARM_PARAMETERIZED_PMAP */
2222
2223 kernel_pmap->nested_region_addr = 0x0ULL;
2224 kernel_pmap->nested_region_size = 0x0ULL;
2225 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2226 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2227 kernel_pmap->type = PMAP_TYPE_KERNEL;
2228
2229 kernel_pmap->hw_asid = 0;
2230 kernel_pmap->sw_asid = 0;
2231
2232 pmap_lock_init(kernel_pmap);
2233
2234 pmap_max_asids = pmap_compute_max_asids();
2235 #if HAS_16BIT_ASID
2236 asid_chunk_size = MAX_HW_ASIDS;
2237 #else
2238 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2239 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2240 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2241 * masking used by the PLRU scheme. This means we must handle the case in which
2242 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2243 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2244 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2245 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2246 #endif /* HAS_16BIT_ASIDS */
2247
2248 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2249
2250 /**
2251 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2252 * pp_attr_table, etc). This function will use `avail_start` to allocate
2253 * space for these data structures.
2254 */
2255 pmap_data_bootstrap();
2256
2257 /**
2258 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2259 */
2260 uat_bootstrap();
2261
2262
2263 /**
2264 * Bootstrap any necessary SART data structures and values needed from the device tree.
2265 */
2266 sart_bootstrap();
2267
2268 /**
2269 * Don't make any assumptions about the alignment of avail_start before this
2270 * point (i.e., pmap_data_bootstrap() performs allocations).
2271 */
2272 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2273
2274 const pmap_paddr_t pmap_struct_start = avail_start;
2275
2276 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2277 avail_start = round_page(avail_start + asid_table_size);
2278
2279 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2280
2281 vm_first_phys = gPhysBase;
2282 vm_last_phys = trunc_page(avail_end);
2283
2284 queue_init(&map_pmap_list);
2285 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2286 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2287 free_page_size_tt_count = 0;
2288 free_page_size_tt_max = 0;
2289 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2290 free_two_page_size_tt_count = 0;
2291 free_two_page_size_tt_max = 0;
2292 free_tt_list = TT_FREE_ENTRY_NULL;
2293 free_tt_count = 0;
2294 free_tt_max = 0;
2295
2296 virtual_space_start = vstart;
2297 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2298
2299 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2300 #if !HAS_16BIT_ASID
2301 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2302 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2303 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2304 #endif /* !HAS_16BIT_ASID */
2305
2306
2307
2308 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2309 maxoffset = trunc_page(maxoffset);
2310 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2311 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2312 arm_pmap_max_offset_default = maxoffset;
2313 }
2314 }
2315 #if defined(__arm64__)
2316 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2317 maxoffset = trunc_page(maxoffset);
2318 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2319 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2320 arm64_pmap_max_offset_default = maxoffset;
2321 }
2322 }
2323 #endif
2324
2325 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2326
2327
2328 #if PMAP_CS_PPL_MONITOR
2329 /* Initialize the PPL trust cache read-write lock */
2330 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2331 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2332 #endif
2333
2334 #if MACH_ASSERT
2335 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2336 &vm_footprint_suspend_allowed,
2337 sizeof(vm_footprint_suspend_allowed));
2338 #endif /* MACH_ASSERT */
2339
2340 #if KASAN
2341 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2342 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2343 #endif /* KASAN */
2344
2345 /**
2346 * Ensure that avail_start is always left on a page boundary. The calling
2347 * code might not perform any alignment before allocating page tables so
2348 * this is important.
2349 */
2350 avail_start = round_page(avail_start);
2351 }
2352
2353 #if XNU_MONITOR
2354
2355 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2356 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2357 {
2358 pmap_paddr_t cur_pa;
2359 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2360 assert(pa_valid(cur_pa));
2361 ppattr_pa_set_monitor(cur_pa);
2362 }
2363 }
2364
2365 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2366 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2367 pmap_paddr_t end_pa,
2368 unsigned int expected_perm,
2369 unsigned int new_perm)
2370 {
2371 vm_offset_t start_va = phystokv(start_pa);
2372 vm_offset_t end_va = start_va + (end_pa - start_pa);
2373
2374 pa_set_range_monitor(start_pa, end_pa);
2375 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2376 }
2377
2378 static void
pmap_lockdown_kc(void)2379 pmap_lockdown_kc(void)
2380 {
2381 extern vm_offset_t vm_kernelcache_base;
2382 extern vm_offset_t vm_kernelcache_top;
2383 pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
2384 pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
2385 pmap_paddr_t cur_pa = start_pa;
2386 vm_offset_t cur_va = vm_kernelcache_base;
2387 while (cur_pa < end_pa) {
2388 vm_size_t range_size = end_pa - cur_pa;
2389 vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
2390 if (ptov_va != cur_va) {
2391 /*
2392 * If the physical address maps back to a virtual address that is non-linear
2393 * w.r.t. the kernelcache, that means it corresponds to memory that will be
2394 * reclaimed by the OS and should therefore not be locked down.
2395 */
2396 cur_pa += range_size;
2397 cur_va += range_size;
2398 continue;
2399 }
2400 unsigned int pai = pa_index(cur_pa);
2401 pv_entry_t **pv_h = pai_to_pvh(pai);
2402
2403 vm_offset_t pvh_flags = pvh_get_flags(pv_h);
2404
2405 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
2406 panic("pai %d already locked down", pai);
2407 }
2408
2409 pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
2410 cur_pa += ARM_PGBYTES;
2411 cur_va += ARM_PGBYTES;
2412 }
2413 #if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
2414 extern uint64_t ctrr_ro_test;
2415 extern uint64_t ctrr_nx_test;
2416 pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
2417 for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
2418 pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
2419 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
2420 }
2421 #endif
2422 }
2423
2424 void
pmap_static_allocations_done(void)2425 pmap_static_allocations_done(void)
2426 {
2427 pmap_paddr_t monitor_start_pa;
2428 pmap_paddr_t monitor_end_pa;
2429
2430 /*
2431 * Protect the bootstrap (V=P and V->P) page tables.
2432 *
2433 * These bootstrap allocations will be used primarily for page tables.
2434 * If we wish to secure the page tables, we need to start by marking
2435 * these bootstrap allocations as pages that we want to protect.
2436 */
2437 monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2438 monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2439
2440 /* The bootstrap page tables are mapped RW at boostrap. */
2441 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2442
2443 /*
2444 * We use avail_start as a pointer to the first address that has not
2445 * been reserved for bootstrap, so we know which pages to give to the
2446 * virtual memory layer.
2447 */
2448 monitor_start_pa = first_avail_phys;
2449 monitor_end_pa = avail_start;
2450
2451 /* The other bootstrap allocations are mapped RW at bootstrap. */
2452 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2453
2454 /*
2455 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2456 * to RO in arm_vm_prot_finalize(), which is called after this function.
2457 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2458 * they can't be allocated for other uses. We don't need a special xPRR
2459 * protection index, as there is no PPL_RO index, and these pages are ultimately
2460 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
2461 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2462 * lookup table index to USER_XO before APRR is applied, leading the hardware
2463 * to believe we are dealing with an user XO page upon performing a translation.
2464 */
2465 monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2466 monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2467 pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2468
2469 monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2470 monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2471
2472 /* PPL data is RW for the PPL, RO for the kernel. */
2473 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2474
2475 monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2476 monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2477
2478 /* PPL text is RX for the PPL, RO for the kernel. */
2479 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2480
2481
2482 /*
2483 * In order to support DTrace, the save areas for the PPL must be
2484 * writable. This is due to the fact that DTrace will try to update
2485 * register state.
2486 */
2487 if (pmap_ppl_disable) {
2488 vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2489 vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2490
2491 pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2492 }
2493
2494
2495 if (segSizePPLDATACONST > 0) {
2496 monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2497 monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2498
2499 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2500 }
2501
2502 /*
2503 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2504 * precaution. The real RW mappings are at a different location with guard pages.
2505 */
2506 pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2507
2508 /* Prevent remapping of the kernelcache */
2509 pmap_lockdown_kc();
2510 }
2511
/**
 * Lock down the commpage mappings with PVH_FLAG_LOCKDOWN_KC.
 *
 * With preemption disabled, pmap_ppl_lockdown_page() is called for
 * commpage_ro_data_kva and, when commpage_text_kva is non-zero,
 * pmap_ppl_lockdown_page_with_prot() is called for it with
 * VM_PROT_READ | VM_PROT_EXECUTE.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/*
	 * NOTE(review): the #error below is not guarded by any conditional
	 * inside this function, so every build that compiles this function
	 * stops here.  Presumably the configuration-specific code that
	 * originally surrounded it is absent from this copy of the file --
	 * confirm before enabling XNU_MONITOR.
	 */
#error "XPRR configuration error"
}
2528 #endif /* XNU_MONITOR */
2529
2530 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2531 pmap_virtual_space(
2532 vm_offset_t *startp,
2533 vm_offset_t *endp
2534 )
2535 {
2536 *startp = virtual_space_start;
2537 *endp = virtual_space_end;
2538 }
2539
2540
2541 boolean_t
pmap_virtual_region(unsigned int region_select,vm_map_offset_t * startp,vm_map_size_t * size)2542 pmap_virtual_region(
2543 unsigned int region_select,
2544 vm_map_offset_t *startp,
2545 vm_map_size_t *size
2546 )
2547 {
2548 boolean_t ret = FALSE;
2549 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
2550 if (region_select == 0) {
2551 /*
2552 * In this config, the bootstrap mappings should occupy their own L2
2553 * TTs, as they should be immutable after boot. Having the associated
2554 * TTEs and PTEs in their own pages allows us to lock down those pages,
2555 * while allowing the rest of the kernel address range to be remapped.
2556 */
2557 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2558 #if defined(ARM_LARGE_MEMORY)
2559 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2560 #else
2561 *size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
2562 #endif
2563 ret = TRUE;
2564 }
2565
2566 #if defined(ARM_LARGE_MEMORY)
2567 if (region_select == 1) {
2568 *startp = VREGION1_START;
2569 *size = VREGION1_SIZE;
2570 ret = TRUE;
2571 }
2572 #endif
2573 #else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
2574 #if defined(ARM_LARGE_MEMORY)
2575 /* For large memory systems with no KTRR/CTRR such as virtual machines */
2576 if (region_select == 0) {
2577 *startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
2578 *size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
2579 ret = TRUE;
2580 }
2581
2582 if (region_select == 1) {
2583 *startp = VREGION1_START;
2584 *size = VREGION1_SIZE;
2585 ret = TRUE;
2586 }
2587 #else /* !defined(ARM_LARGE_MEMORY) */
2588 unsigned long low_global_vr_mask = 0;
2589 vm_map_size_t low_global_vr_size = 0;
2590
2591 if (region_select == 0) {
2592 /* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
2593 if (!TEST_PAGE_SIZE_4K) {
2594 *startp = gVirtBase & 0xFFFFFFFFFE000000;
2595 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
2596 } else {
2597 *startp = gVirtBase & 0xFFFFFFFFFF800000;
2598 *size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
2599 }
2600 ret = TRUE;
2601 }
2602 if (region_select == 1) {
2603 *startp = VREGION1_START;
2604 *size = VREGION1_SIZE;
2605 ret = TRUE;
2606 }
2607 /* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
2608 if (!TEST_PAGE_SIZE_4K) {
2609 low_global_vr_mask = 0xFFFFFFFFFE000000;
2610 low_global_vr_size = 0x2000000;
2611 } else {
2612 low_global_vr_mask = 0xFFFFFFFFFF800000;
2613 low_global_vr_size = 0x800000;
2614 }
2615
2616 if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
2617 *startp = LOW_GLOBAL_BASE_ADDRESS;
2618 *size = low_global_vr_size;
2619 ret = TRUE;
2620 }
2621
2622 if (region_select == 3) {
2623 /* In this config, we allow the bootstrap mappings to occupy the same
2624 * page table pages as the heap.
2625 */
2626 *startp = VM_MIN_KERNEL_ADDRESS;
2627 *size = LOW_GLOBAL_BASE_ADDRESS - *startp;
2628 ret = TRUE;
2629 }
2630 #endif /* defined(ARM_LARGE_MEMORY) */
2631 #endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
2632 return ret;
2633 }
2634
2635 /*
2636 * Routines to track and allocate physical pages during early boot.
2637 * On most systems that memory runs from first_avail through to avail_end
2638 * with no gaps.
2639 *
2640 * If the system supports ECC and ecc_bad_pages_count > 0, we
2641 * need to skip those pages.
2642 */
2643
/* Number of pages remaining in [first_avail, avail_end); set by initialize_ram_ranges() and decremented by pmap_next_page(). */
static unsigned int avail_page_count = 0;
/* True until initialize_ram_ranges() has run and populated avail_page_count. */
static bool need_ram_ranges_init = true;
2646
2647
2648 /**
2649 * Checks to see if a given page is in
2650 * the array of known bad pages
2651 *
2652 * @param ppn page number to check
2653 */
2654 bool
pmap_is_bad_ram(__unused ppnum_t ppn)2655 pmap_is_bad_ram(__unused ppnum_t ppn)
2656 {
2657 return false;
2658 }
2659
2660 /**
2661 * Prepare bad ram pages to be skipped.
2662 */
2663
2664 /*
2665 * Initialize the count of available pages. No lock needed here,
2666 * as this code is called while kernel boot up is single threaded.
2667 */
2668 static void
initialize_ram_ranges(void)2669 initialize_ram_ranges(void)
2670 {
2671 pmap_paddr_t first = first_avail;
2672 pmap_paddr_t end = avail_end;
2673
2674 assert(first <= end);
2675 assert(first == (first & ~PAGE_MASK));
2676 assert(end == (end & ~PAGE_MASK));
2677 avail_page_count = atop(end - first);
2678
2679 need_ram_ranges_init = false;
2680 }
2681
2682 unsigned int
pmap_free_pages(void)2683 pmap_free_pages(
2684 void)
2685 {
2686 if (need_ram_ranges_init) {
2687 initialize_ram_ranges();
2688 }
2689 return avail_page_count;
2690 }
2691
2692 unsigned int
pmap_free_pages_span(void)2693 pmap_free_pages_span(
2694 void)
2695 {
2696 if (need_ram_ranges_init) {
2697 initialize_ram_ranges();
2698 }
2699 return (unsigned int)atop(avail_end - first_avail);
2700 }
2701
2702
2703 boolean_t
pmap_next_page_hi(ppnum_t * pnum,__unused boolean_t might_free)2704 pmap_next_page_hi(
2705 ppnum_t * pnum,
2706 __unused boolean_t might_free)
2707 {
2708 return pmap_next_page(pnum);
2709 }
2710
2711
2712 boolean_t
pmap_next_page(ppnum_t * pnum)2713 pmap_next_page(
2714 ppnum_t *pnum)
2715 {
2716 if (need_ram_ranges_init) {
2717 initialize_ram_ranges();
2718 }
2719
2720
2721 if (first_avail != avail_end) {
2722 *pnum = (ppnum_t)atop(first_avail);
2723 first_avail += PAGE_SIZE;
2724 assert(avail_page_count > 0);
2725 --avail_page_count;
2726 return TRUE;
2727 }
2728 assert(avail_page_count == 0);
2729 return FALSE;
2730 }
2731
2732
2733 /*
2734 * Initialize the pmap module.
2735 * Called by vm_init, to initialize any structures that the pmap
2736 * system needs to map virtual memory.
2737 */
2738 void
pmap_init(void)2739 pmap_init(
2740 void)
2741 {
2742 /*
2743 * Protect page zero in the kernel map.
2744 * (can be overruled by permanent transltion
2745 * table entries at page zero - see arm_vm_init).
2746 */
2747 vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);
2748
2749 pmap_initialized = TRUE;
2750
2751 /*
2752 * Create the zone of physical maps
2753 * and the physical-to-virtual entries.
2754 */
2755 pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
2756 ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);
2757
2758
2759 /*
2760 * Initialize the pmap object (for tracking the vm_page_t
2761 * structures for pages we allocate to be page tables in
2762 * pmap_expand().
2763 */
2764 _vm_object_allocate(mem_size, pmap_object);
2765 pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;
2766
2767 /*
2768 * The values of [hard_]maxproc may have been scaled, make sure
2769 * they are still less than the value of pmap_max_asids.
2770 */
2771 if ((uint32_t)maxproc > pmap_max_asids) {
2772 maxproc = pmap_max_asids;
2773 }
2774 if ((uint32_t)hard_maxproc > pmap_max_asids) {
2775 hard_maxproc = pmap_max_asids;
2776 }
2777 }
2778
2779 /**
2780 * Verify that a given physical page contains no mappings (outside of the
2781 * default physical aperture mapping).
2782 *
2783 * @param ppnum Physical page number to check there are no mappings to.
2784 *
2785 * @return True if there are no mappings, false otherwise or if the page is not
2786 * kernel-managed.
2787 */
2788 bool
pmap_verify_free(ppnum_t ppnum)2789 pmap_verify_free(ppnum_t ppnum)
2790 {
2791 const pmap_paddr_t pa = ptoa(ppnum);
2792
2793 assert(pa != vm_page_fictitious_addr);
2794
2795 /* Only mappings to kernel-managed physical memory are tracked. */
2796 if (!pa_valid(pa)) {
2797 return false;
2798 }
2799
2800 const unsigned int pai = pa_index(pa);
2801 pv_entry_t **pvh = pai_to_pvh(pai);
2802
2803 return pvh_test_type(pvh, PVH_TYPE_NULL);
2804 }
2805
#if MACH_ASSERT
/**
 * Panic if a physical page still has mappings recorded in its PV head entry
 * (the default physical aperture mapping is not counted).
 *
 * @note The PVH lock is not taken here. Inside the PPL, prefer calling
 *       pmap_verify_free() directly: this routine is normally invoked from
 *       outside the PPL, where the pv_head_table cannot be modified.
 *
 * @param ppnum Physical page number to examine.
 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Untracked (non-kernel-managed) or genuinely free pages are fine. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * The PVH entry cannot be locked because this function always runs
	 * outside of the PPL. It is generally called just before the VM reclaims
	 * a physical page, when no new mappings should be appearing. If one does
	 * appear while the hierarchy is being walked, the worst outcome is a
	 * different panic, and a panic was imminent regardless.
	 */

	/**
	 * pmap_verify_free() said there is at least one mapping. Collect details
	 * about the first one for the panic string; the usual case is a single
	 * leftover mapping that was never removed.
	 */
	pt_entry_t *offending_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		offending_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pve_head = pvh_pve_list(pvh);
		int slot = 0;

		/* A PVE holds several PTE slots; take the first populated one. */
		while (slot < PTE_PER_PVE) {
			offending_ptep = pve_get_ptep(pve_head, slot);
			if (offending_ptep != PT_ENTRY_NULL) {
				break;
			}
			slot++;
		}

		/* At least one slot of the PVE is expected to be populated. */
		assert(offending_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping vanished after pmap_verify_free() ran, which means
		 * something is unmapping PTEs for this page concurrently. That
		 * should not happen during this check and cannot be prevented
		 * without the PVH lock, so treat it as fatal.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Report the first stale mapping together with the pmap that owns it. */
	pmap_t owner = ptep_get_pmap(offending_ptep);
	const char *owner_kind = "User";

	if (owner == kernel_pmap) {
		owner_kind = "Kernel";
	}

	panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
	    "%s CPU mapping (pmap: %p)",
	    __func__, (uint64_t)pa, offending_ptep, owner_kind, owner);
}
#endif
2891
2892
2893 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2894 pmap_root_alloc_size(pmap_t pmap)
2895 {
2896 #pragma unused(pmap)
2897 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2898 unsigned int root_level = pt_attr_root_level(pt_attr);
2899 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2900 }
2901
2902
2903 /*
2904 * Create and return a physical map.
2905 *
2906 * If the size specified for the map
2907 * is zero, the map is an actual physical
2908 * map, and may be referenced by the
2909 * hardware.
2910 *
2911 * If the size specified is non-zero,
2912 * the map will be used in software only, and
2913 * is bounded by that size.
2914 */
2915 MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(ledger_t ledger,vm_map_size_t size,unsigned int flags,kern_return_t * kr)2916 pmap_create_options_internal(
2917 ledger_t ledger,
2918 vm_map_size_t size,
2919 unsigned int flags,
2920 kern_return_t *kr)
2921 {
2922 unsigned i;
2923 unsigned tte_index_max;
2924 pmap_t p;
2925 bool is_64bit = flags & PMAP_CREATE_64BIT;
2926 #if defined(HAS_APPLE_PAC)
2927 bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
2928 #endif /* defined(HAS_APPLE_PAC) */
2929 kern_return_t local_kr = KERN_SUCCESS;
2930
2931 if (size != 0) {
2932 {
2933 // Size parameter should only be set for stage 2.
2934 return PMAP_NULL;
2935 }
2936 }
2937
2938 if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
2939 return PMAP_NULL;
2940 }
2941
2942 #if XNU_MONITOR
2943 if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
2944 goto pmap_create_fail;
2945 }
2946
2947 assert(p != PMAP_NULL);
2948
2949 if (ledger) {
2950 pmap_ledger_validate(ledger);
2951 pmap_ledger_retain(ledger);
2952 }
2953 #else
2954 /*
2955 * Allocate a pmap struct from the pmap_zone. Then allocate
2956 * the translation table of the right size for the pmap.
2957 */
2958 if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
2959 local_kr = KERN_RESOURCE_SHORTAGE;
2960 goto pmap_create_fail;
2961 }
2962 #endif
2963
2964 p->ledger = ledger;
2965
2966
2967 p->pmap_vm_map_cs_enforced = false;
2968
2969
2970 #if CONFIG_ROSETTA
2971 if (flags & PMAP_CREATE_ROSETTA) {
2972 p->is_rosetta = TRUE;
2973 } else {
2974 p->is_rosetta = FALSE;
2975 }
2976 #endif /* CONFIG_ROSETTA */
2977
2978 #if defined(HAS_APPLE_PAC)
2979 p->disable_jop = disable_jop;
2980 #endif /* defined(HAS_APPLE_PAC) */
2981
2982 p->nested_region_true_start = 0;
2983 p->nested_region_true_end = ~0;
2984
2985 p->nx_enabled = true;
2986 p->is_64bit = is_64bit;
2987 p->nested_pmap = PMAP_NULL;
2988 p->type = PMAP_TYPE_USER;
2989
2990 #if ARM_PARAMETERIZED_PMAP
2991 /* Default to the native pt_attr */
2992 p->pmap_pt_attr = native_pt_attr;
2993 #endif /* ARM_PARAMETERIZED_PMAP */
2994 #if __ARM_MIXED_PAGE_SIZE__
2995 if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
2996 p->pmap_pt_attr = &pmap_pt_attr_4k;
2997 }
2998 #endif /* __ARM_MIXED_PAGE_SIZE__ */
2999 p->max = pmap_user_va_size(p);
3000 /* Don't allow mapping the first page (i.e. NULL or near-NULL). */
3001 p->min = pt_attr_page_size(pmap_get_pt_attr(p));
3002
3003 if (!pmap_get_pt_ops(p)->alloc_id(p)) {
3004 local_kr = KERN_NO_SPACE;
3005 goto id_alloc_fail;
3006 }
3007
3008 pmap_lock_init(p);
3009
3010 p->tt_entry_free = (tt_entry_t *)0;
3011 tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));
3012
3013
3014 #if XNU_MONITOR
3015 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
3016 #else
3017 p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
3018 #endif
3019 if (!(p->tte)) {
3020 local_kr = KERN_RESOURCE_SHORTAGE;
3021 goto tt1_alloc_fail;
3022 }
3023
3024 p->ttep = ml_static_vtop((vm_offset_t)p->tte);
3025 PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);
3026
3027 /* nullify the translation table */
3028 for (i = 0; i < tte_index_max; i++) {
3029 p->tte[i] = ARM_TTE_TYPE_FAULT;
3030 }
3031
3032 FLUSH_PTE();
3033
3034 /*
3035 * initialize the rest of the structure
3036 */
3037 p->nested_region_addr = 0x0ULL;
3038 p->nested_region_size = 0x0ULL;
3039 p->nested_region_unnested_table_bitmap = NULL;
3040 p->nested_region_unnested_table_bitmap_size = 0x0UL;
3041
3042 p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
3043 p->nested_no_bounds_refcnt = 0;
3044 p->nested_bounds_set = false;
3045
3046
3047 #if MACH_ASSERT
3048 p->pmap_pid = 0;
3049 strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
3050 #endif /* MACH_ASSERT */
3051 #if DEVELOPMENT || DEBUG
3052 p->footprint_was_suspended = FALSE;
3053 #endif /* DEVELOPMENT || DEBUG */
3054
3055 #if XNU_MONITOR
3056 os_atomic_init(&p->nested_count, 0);
3057 assert(os_atomic_load(&p->ref_count, relaxed) == 0);
3058 /* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
3059 os_atomic_thread_fence(release);
3060 #endif
3061 os_atomic_init(&p->ref_count, 1);
3062 pmap_simple_lock(&pmaps_lock);
3063 queue_enter(&map_pmap_list, p, pmap_t, pmaps);
3064 pmap_simple_unlock(&pmaps_lock);
3065
3066 /*
3067 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
3068 * which can lead to a concurrent disconnect operation making the balance
3069 * transiently negative. The ledger should still ultimately balance out,
3070 * which we still check upon pmap destruction.
3071 */
3072 ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
3073 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
3074 ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
3075 ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
3076 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
3077 ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
3078 ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
3079 ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
3080 ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);
3081
3082 return p;
3083
3084 tt1_alloc_fail:
3085 pmap_get_pt_ops(p)->free_id(p);
3086 id_alloc_fail:
3087 #if XNU_MONITOR
3088 pmap_free_pmap(p);
3089
3090 if (ledger) {
3091 pmap_ledger_release(ledger);
3092 }
3093 #else
3094 zfree(pmap_zone, p);
3095 #endif
3096 pmap_create_fail:
3097 #if XNU_MONITOR
3098 pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3099 #endif
3100 *kr = local_kr;
3101 #if XNU_MONITOR
3102 pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
3103 #endif
3104 return PMAP_NULL;
3105 }
3106
3107 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3108 pmap_create_options(
3109 ledger_t ledger,
3110 vm_map_size_t size,
3111 unsigned int flags)
3112 {
3113 pmap_t pmap;
3114 kern_return_t kr = KERN_SUCCESS;
3115
3116 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3117
3118 ledger_reference(ledger);
3119
3120 #if XNU_MONITOR
3121 for (;;) {
3122 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3123 if (kr != KERN_RESOURCE_SHORTAGE) {
3124 break;
3125 }
3126 assert(pmap == PMAP_NULL);
3127 pmap_alloc_page_for_ppl(0);
3128 kr = KERN_SUCCESS;
3129 }
3130 #else
3131 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3132 #endif
3133
3134 if (pmap == PMAP_NULL) {
3135 ledger_dereference(ledger);
3136 }
3137
3138 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3139
3140 return pmap;
3141 }
3142
#if XNU_MONITOR
/*
 * When the PPL is enabled this symbol is kept in every configuration so the
 * dispatch table is the same in development and release builds.
 */
#endif
#if MACH_ASSERT || XNU_MONITOR
/*
 * Record the owning process's pid and name in the pmap.
 *
 * Only does anything when MACH_ASSERT is set. A NULL pmap, or one whose
 * pmap_pid is already -1, is left untouched.
 *
 * @param pmap     pmap to annotate.
 * @param pid      Process ID to store.
 * @param procname Process name, copied (with truncation) into pmap_procname.
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	if (pmap == NULL) {
		return;
	}
	if (pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
#endif /* MACH_ASSERT || XNU_MONITOR */
3168
#if MACH_ASSERT
/*
 * Record a process's pid and name in a pmap, going through the PPL entry
 * point when XNU_MONITOR is set and calling the internal routine directly
 * otherwise.
 */
void
pmap_set_process(pmap_t pmap, int pid, char *procname)
{
#if !XNU_MONITOR
	pmap_set_process_internal(pmap, pid, procname);
#else
	pmap_set_process_ppl(pmap, pid, procname);
#endif
}
#endif /* MACH_ASSERT */
3183
3184 /*
3185 * pmap_deallocate_all_leaf_tts:
3186 *
3187 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3188 * removing and deallocating all TTEs.
3189 */
3190 MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap,tt_entry_t * first_ttep,unsigned level)3191 pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
3192 {
3193 tt_entry_t tte = ARM_TTE_EMPTY;
3194 tt_entry_t * ttep = NULL;
3195 tt_entry_t * last_ttep = NULL;
3196
3197 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3198
3199 assert(level < pt_attr_leaf_level(pt_attr));
3200
3201 last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];
3202
3203 for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
3204 tte = *ttep;
3205
3206 if (!(tte & ARM_TTE_VALID)) {
3207 continue;
3208 }
3209
3210 if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
3211 panic("%s: found block mapping, ttep=%p, tte=%p, "
3212 "pmap=%p, first_ttep=%p, level=%u",
3213 __FUNCTION__, ttep, (void *)tte,
3214 pmap, first_ttep, level);
3215 }
3216
3217 /* Must be valid, type table */
3218 if (level < pt_attr_twig_level(pt_attr)) {
3219 /* If we haven't reached the twig level, recurse to the next level. */
3220 pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
3221 }
3222
3223 /* Remove the TTE. */
3224 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
3225 pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
3226 }
3227 }
3228
3229 /*
3230 * We maintain stats and ledgers so that a task's physical footprint is:
3231 * phys_footprint = ((internal - alternate_accounting)
3232 * + (internal_compressed - alternate_accounting_compressed)
3233 * + iokit_mapped
3234 * + purgeable_nonvolatile
3235 * + purgeable_nonvolatile_compressed
3236 * + page_table)
3237 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3238 */
3239
3240 /*
3241 * Retire the given physical map from service.
3242 * Should only be called if the map contains
3243 * no valid mappings.
3244 */
3245 MARK_AS_PMAP_TEXT void
pmap_destroy_internal(pmap_t pmap)3246 pmap_destroy_internal(
3247 pmap_t pmap)
3248 {
3249 if (pmap == PMAP_NULL) {
3250 return;
3251 }
3252
3253 validate_pmap(pmap);
3254
3255 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
3256
3257 int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
3258 if (ref_count > 0) {
3259 return;
3260 } else if (__improbable(ref_count < 0)) {
3261 panic("pmap %p: refcount underflow", pmap);
3262 } else if (__improbable(pmap == kernel_pmap)) {
3263 panic("pmap %p: attempt to destroy kernel pmap", pmap);
3264 } else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
3265 panic("pmap %p: attempt to destroy commpage pmap", pmap);
3266 }
3267
3268 #if XNU_MONITOR
3269 /*
3270 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
3271 * pmaps below will not be speculated ahead of the decrement of ref_count above.
3272 * That ensures that if the pmap is currently in use elsewhere, this path will
3273 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
3274 * ref_count of 0 and panic.
3275 */
3276 os_atomic_thread_fence(seq_cst);
3277 if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
3278 panic("pmap %p: attempt to destroy while nested", pmap);
3279 }
3280 const int max_cpu = ml_get_max_cpu_number();
3281 for (unsigned int i = 0; i <= max_cpu; ++i) {
3282 const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
3283 if (cpu_data == NULL) {
3284 continue;
3285 }
3286 if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
3287 panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
3288 } else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
3289 panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
3290 }
3291 }
3292 #endif
3293 pmap_unmap_commpage(pmap);
3294
3295 pmap_simple_lock(&pmaps_lock);
3296 queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
3297 pmap_simple_unlock(&pmaps_lock);
3298
3299 pmap_trim_self(pmap);
3300
3301 /*
3302 * Free the memory maps, then the
3303 * pmap structure.
3304 */
3305 pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));
3306
3307
3308
3309 if (pmap->tte) {
3310 pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
3311 pmap->tte = (tt_entry_t *) NULL;
3312 pmap->ttep = 0;
3313 }
3314
3315 assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);
3316
3317 if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
3318 pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
3319 sync_tlb_flush();
3320 } else {
3321 pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
3322 sync_tlb_flush();
3323 /* return its asid to the pool */
3324 pmap_get_pt_ops(pmap)->free_id(pmap);
3325 if (pmap->nested_pmap != NULL) {
3326 #if XNU_MONITOR
3327 os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
3328 #endif
3329 /* release the reference we hold on the nested pmap */
3330 pmap_destroy_internal(pmap->nested_pmap);
3331 }
3332 }
3333
3334 pmap_check_ledgers(pmap);
3335
3336 if (pmap->nested_region_unnested_table_bitmap) {
3337 #if XNU_MONITOR
3338 pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
3339 #else
3340 kfree_data(pmap->nested_region_unnested_table_bitmap,
3341 pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
3342 #endif
3343 }
3344
3345 #if XNU_MONITOR
3346 if (pmap->ledger) {
3347 pmap_ledger_release(pmap->ledger);
3348 }
3349
3350 pmap_lock_destroy(pmap);
3351 pmap_free_pmap(pmap);
3352 #else
3353 pmap_lock_destroy(pmap);
3354 zfree(pmap_zone, pmap);
3355 #endif
3356 }
3357
3358 void
pmap_destroy(pmap_t pmap)3359 pmap_destroy(
3360 pmap_t pmap)
3361 {
3362 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3363
3364 ledger_t ledger = pmap->ledger;
3365
3366 #if XNU_MONITOR
3367 pmap_destroy_ppl(pmap);
3368
3369 pmap_ledger_check_balance(pmap);
3370 #else
3371 pmap_destroy_internal(pmap);
3372 #endif
3373
3374 ledger_dereference(ledger);
3375
3376 PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
3377 }
3378
3379
3380 /*
3381 * Add a reference to the specified pmap.
3382 */
3383 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3384 pmap_reference_internal(
3385 pmap_t pmap)
3386 {
3387 if (pmap != PMAP_NULL) {
3388 validate_pmap_mutable(pmap);
3389 os_atomic_inc(&pmap->ref_count, relaxed);
3390 }
3391 }
3392
3393 void
pmap_reference(pmap_t pmap)3394 pmap_reference(
3395 pmap_t pmap)
3396 {
3397 #if XNU_MONITOR
3398 pmap_reference_ppl(pmap);
3399 #else
3400 pmap_reference_internal(pmap);
3401 #endif
3402 }
3403
3404 static tt_entry_t *
pmap_tt1_allocate(pmap_t pmap,vm_size_t size,unsigned option)3405 pmap_tt1_allocate(
3406 pmap_t pmap,
3407 vm_size_t size,
3408 unsigned option)
3409 {
3410 tt_entry_t *tt1 = NULL;
3411 tt_free_entry_t *tt1_free;
3412 pmap_paddr_t pa;
3413 vm_address_t va;
3414 vm_address_t va_end;
3415 kern_return_t ret;
3416
3417 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3418 size = PAGE_SIZE;
3419 }
3420
3421 pmap_simple_lock(&tt1_lock);
3422 if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
3423 free_page_size_tt_count--;
3424 tt1 = (tt_entry_t *)free_page_size_tt_list;
3425 free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3426 } else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
3427 free_two_page_size_tt_count--;
3428 tt1 = (tt_entry_t *)free_two_page_size_tt_list;
3429 free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
3430 } else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
3431 free_tt_count--;
3432 tt1 = (tt_entry_t *)free_tt_list;
3433 free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
3434 }
3435
3436 pmap_simple_unlock(&tt1_lock);
3437
3438 if (tt1 != NULL) {
3439 pmap_tt_ledger_credit(pmap, size);
3440 return (tt_entry_t *)tt1;
3441 }
3442
3443 ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));
3444
3445 if (ret == KERN_RESOURCE_SHORTAGE) {
3446 return (tt_entry_t *)0;
3447 }
3448
3449 #if XNU_MONITOR
3450 assert(pa);
3451 #endif
3452
3453 if (size < PAGE_SIZE) {
3454 va = phystokv(pa) + size;
3455 tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
3456 tt_free_entry_t *next_free = NULL;
3457 for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
3458 tt1_free = (tt_free_entry_t *)va;
3459 tt1_free->next = next_free;
3460 next_free = tt1_free;
3461 }
3462 pmap_simple_lock(&tt1_lock);
3463 local_free_list->next = free_tt_list;
3464 free_tt_list = next_free;
3465 free_tt_count += ((PAGE_SIZE / size) - 1);
3466 if (free_tt_count > free_tt_max) {
3467 free_tt_max = free_tt_count;
3468 }
3469 pmap_simple_unlock(&tt1_lock);
3470 }
3471
3472 /* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
3473 * Depending on the device, this can vary between 512b and 16K. */
3474 OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3475 OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
3476 pmap_tt_ledger_credit(pmap, size);
3477
3478 return (tt_entry_t *) phystokv(pa);
3479 }
3480
3481 static void
pmap_tt1_deallocate(pmap_t pmap,tt_entry_t * tt,vm_size_t size,unsigned option)3482 pmap_tt1_deallocate(
3483 pmap_t pmap,
3484 tt_entry_t *tt,
3485 vm_size_t size,
3486 unsigned option)
3487 {
3488 tt_free_entry_t *tt_entry;
3489
3490 if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
3491 size = PAGE_SIZE;
3492 }
3493
3494 tt_entry = (tt_free_entry_t *)tt;
3495 assert(not_in_kdp);
3496 pmap_simple_lock(&tt1_lock);
3497
3498 if (size < PAGE_SIZE) {
3499 free_tt_count++;
3500 if (free_tt_count > free_tt_max) {
3501 free_tt_max = free_tt_count;
3502 }
3503 tt_entry->next = free_tt_list;
3504 free_tt_list = tt_entry;
3505 }
3506
3507 if (size == PAGE_SIZE) {
3508 free_page_size_tt_count++;
3509 if (free_page_size_tt_count > free_page_size_tt_max) {
3510 free_page_size_tt_max = free_page_size_tt_count;
3511 }
3512 tt_entry->next = free_page_size_tt_list;
3513 free_page_size_tt_list = tt_entry;
3514 }
3515
3516 if (size == 2 * PAGE_SIZE) {
3517 free_two_page_size_tt_count++;
3518 if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
3519 free_two_page_size_tt_max = free_two_page_size_tt_count;
3520 }
3521 tt_entry->next = free_two_page_size_tt_list;
3522 free_two_page_size_tt_list = tt_entry;
3523 }
3524
3525 if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
3526 pmap_simple_unlock(&tt1_lock);
3527 pmap_tt_ledger_debit(pmap, size);
3528 return;
3529 }
3530
3531 while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
3532 free_page_size_tt_count--;
3533 tt = (tt_entry_t *)free_page_size_tt_list;
3534 free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3535
3536 pmap_simple_unlock(&tt1_lock);
3537
3538 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);
3539
3540 OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3541
3542 pmap_simple_lock(&tt1_lock);
3543 }
3544
3545 while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
3546 free_two_page_size_tt_count--;
3547 tt = (tt_entry_t *)free_two_page_size_tt_list;
3548 free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;
3549
3550 pmap_simple_unlock(&tt1_lock);
3551
3552 pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);
3553
3554 OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
3555
3556 pmap_simple_lock(&tt1_lock);
3557 }
3558 pmap_simple_unlock(&tt1_lock);
3559 pmap_tt_ledger_debit(pmap, size);
3560 }
3561
3562 MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(pmap_t pmap,tt_entry_t ** ttp,unsigned int level,unsigned int options)3563 pmap_tt_allocate(
3564 pmap_t pmap,
3565 tt_entry_t **ttp,
3566 unsigned int level,
3567 unsigned int options)
3568 {
3569 pmap_paddr_t pa;
3570 *ttp = NULL;
3571
3572 /* Traverse the tt_entry_free list to find a free tt_entry */
3573 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
3574 return KERN_ABORTED;
3575 }
3576
3577 if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
3578 tt_free_entry_t *tt_free_cur, *tt_free_next;
3579
3580 tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
3581 tt_free_next = tt_free_cur->next;
3582 tt_free_cur->next = NULL;
3583 *ttp = (tt_entry_t *)tt_free_cur;
3584 pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
3585 }
3586 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3587
3588 /* Only do the heavylifting here when we don't have a free tt_entry. */
3589 if (*ttp == NULL) {
3590 pt_desc_t *ptdp;
3591
3592 /*
3593 * Allocate a VM page for the level x page table entries.
3594 */
3595 while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
3596 if (options & PMAP_OPTIONS_NOWAIT) {
3597 return KERN_RESOURCE_SHORTAGE;
3598 }
3599 VM_PAGE_WAIT();
3600 }
3601
3602 /* Allocate a new Page Table Descriptor for the newly allocated page table. */
3603 while ((ptdp = ptd_alloc(pmap)) == NULL) {
3604 if (options & PMAP_OPTIONS_NOWAIT) {
3605 /* Deallocate all allocated resources so far. */
3606 pmap_pages_free(pa, PAGE_SIZE);
3607 return KERN_RESOURCE_SHORTAGE;
3608 }
3609 VM_PAGE_WAIT();
3610 }
3611
3612 if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
3613 OSAddAtomic64(1, &alloc_ttepages_count);
3614 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
3615 } else {
3616 OSAddAtomic64(1, &alloc_ptepages_count);
3617 OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
3618 }
3619
3620 pmap_tt_ledger_credit(pmap, PAGE_SIZE);
3621
3622 PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);
3623
3624 pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
3625 /* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
3626 pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);
3627
3628 uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
3629 if (PAGE_SIZE > pmap_page_size) {
3630 vm_address_t va;
3631 vm_address_t va_end;
3632
3633 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
3634 /* Deallocate all allocated resources so far. */
3635 pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
3636 PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
3637 pmap_tt_ledger_debit(pmap, PAGE_SIZE);
3638 pmap_pages_free(pa, PAGE_SIZE);
3639 ptd_deallocate(ptdp);
3640
3641 return KERN_ABORTED;
3642 }
3643
3644 for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
3645 ((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
3646 pmap->tt_entry_free = (tt_entry_t *)va;
3647 }
3648 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
3649 }
3650
3651 *ttp = (tt_entry_t *)phystokv(pa);
3652 }
3653
3654 #if XNU_MONITOR
3655 assert(*ttp);
3656 #endif
3657
3658 return KERN_SUCCESS;
3659 }
3660
3661
/**
 * Release a translation table that was previously handed out to a pmap.
 *
 * The table is either pushed onto the pmap's free list (pmap->tt_entry_free)
 * for later reuse or, when every table sharing the same kernel page is unused
 * and already sitting on that free list, the whole page is unlinked from the
 * list and given back through pmap_pages_free().
 *
 * @note Acquires and releases the pmap lock (exclusive) internally.
 * @note Panics if the table's PTD refcnt is still non-zero once the non-leaf
 *       sentinel has been cleared.
 *
 * @param pmap The pmap that owns the table.
 * @param ttp Kernel virtual address of the table being released.
 * @param level Level of the table being released. Tables above the leaf level
 *              have their PT_DESC_REFCOUNT sentinel reset to zero here.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	/* Kernel VA of the page to hand back, or 0 if the page stays allocated. */
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-page-sized tables that fit in one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Invalidate the VA recorded in the descriptor for this table's slot. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/* Non-leaf tables hold the PT_DESC_REFCOUNT sentinel instead of a real count; clear it. */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of every table slot tracked by this descriptor. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * The address of the list head is used as a pseudo-node so that
		 * "->next" reads/writes pmap->tt_entry_free itself.
		 * NOTE(review): this presumably relies on 'next' being the first
		 * member of tt_free_entry_t -- confirm against the type definition.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		/* Starts at 1 to account for ttp itself, which is not on the list. */
		unsigned pt_free_entry_cnt = 1;

		/* Count free-list entries that live in the same kernel page as ttp. */
		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			/* Unsigned subtraction: true only if the entry lies in [page, page + PAGE_SIZE). */
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/* Every table in the page is free: unlink them all so the page can be released. */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			/* tt_free_list trails as the last node kept on the list; tt_free_list_cur is the walker. */
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry belongs to the page being freed: splice it out. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Some sibling table in the page is not on the free list; just park ttp on the list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Another table tracked by this descriptor is still referenced; park ttp on the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* The page itself is released after the pmap lock has been dropped. */
	if (free_page != 0) {
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		/* Clear the PV head entry that pointed at the (now deallocated) descriptor. */
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		/* Non-leaf tables are counted as TTE pages, leaf tables as PTE pages. */
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3750
/**
 * Safely clear out a translation table entry.
 *
 * The entry is zeroed and flushed, the TLB is optionally invalidated for the
 * supplied VA range, the pmap lock is dropped, and the table the entry used to
 * point at is then checked for leftover mappings / references.
 *
 * @note If the TTE to clear out points to a leaf table, then that leaf table
 *       must have a refcnt of zero before the TTE can be removed.
 * @note This function expects to be called with pmap locked exclusive, and will
 *       return with pmap unlocked.
 * @note Panics if the TTE is already empty. Also panics if the consistency
 *       check finds a leaf table with a non-zero refcnt or, when MACH_ASSERT
 *       is enabled, any non-empty entry in the table being removed.
 *
 * @param pmap The pmap containing the page table whose TTE is being removed.
 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
 *               No TLB maintenance is performed unless va_end > va_start.
 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
 * @param ttep Pointer to the TTE that should be cleared out.
 * @param level The level of the page table that contains the TTE to be removed.
 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Snapshot the entry: *ttep is zeroed below, but the old value is still needed to find the table. */
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount.  If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed.  A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount.  Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation.  If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/*
		 * One bit per CPU that has not yet been observed outside of a
		 * disconnect.  NOTE(review): bitmap_full() presumably sets the low
		 * (max_cpu + 1) bits -- confirm against kern/bits.h.
		 */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation.  An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		/* Keep polling until a full pass finds no CPU with its inflight_disconnect flag set. */
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				/* No per-CPU data for this index: skip it without forcing another pass. */
				if (cpu_data == NULL) {
					continue;
				}
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					/*
					 * Flag still set: wait for an event, then re-check this CPU on the
					 * next pass.  NOTE(review): the exclusive load presumably arms the
					 * monitor so that the flag being cleared generates the wake-up
					 * event -- confirm.
					 */
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				/* Flag clear: drop the exclusive reservation and stop tracking this CPU. */
				os_atomic_clear_exclusive();
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		/* Base (pmap-page-aligned) kernel VA of the table that the removed TTE pointed to. */
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					/* Ignored marker: deliberately not counted as non-empty either. */
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3940
3941 /**
3942 * Given a pointer to an entry within a `level` page table, delete the
3943 * page table at `level` + 1 that is represented by that entry. For instance,
3944 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3945 * contains the PA of the L3 table, and `level` would be "2".
3946 *
3947 * @note If the table getting deallocated is a leaf table, then that leaf table
3948 * must have a refcnt of zero before getting deallocated. All other levels
3949 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3950 * @note This function expects to be called with pmap locked exclusive and will
3951 * return with pmap unlocked.
3952 *
3953 * @param pmap The pmap that owns the page table to be deallocated.
3954 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3955 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3956 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3957 * @param ttep Pointer to the `level` TTE to remove.
3958 * @param level The level of the table that contains an entry pointing to the
3959 * table to be removed. The deallocated page table will be a
3960 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3961 * deleted).
3962 */
3963 void
3964 pmap_tte_deallocate(
3965 pmap_t pmap,
3966 vm_offset_t va_start,
3967 vm_offset_t va_end,
3968 bool need_strong_sync,
3969 tt_entry_t *ttep,
3970 unsigned int level)
3971 {
3972 tt_entry_t tte;
3973
3974 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
3975
3976 tte = *ttep;
3977
3978 if (tte_get_ptd(tte)->pmap != pmap) {
3979 panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
3980 __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
3981 }
3982
3983 assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
3984 __func__, ttep, (unsigned long long)tte);
3985
3986 /* pmap_tte_remove() will drop the pmap lock */
3987 pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);
3988
3989 pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
3990 }
3991
3992 /*
3993 * Remove a range of hardware page-table entries.
3994 * The entries given are the first (inclusive)
3995 * and last (exclusive) entries for the VM pages.
3996 * The virtual address is the va for the first pte.
3997 *
3998 * The pmap must be locked.
3999 * If the pmap is not the kernel pmap, the range must lie
4000 * entirely within one pte-page. This is NOT checked.
4001 * Assumes that the pte-page exists.
4002 *
4003 * Returns the number of PTE changed
4004 */
4005 MARK_AS_PMAP_TEXT static int
4006 pmap_remove_range(
4007 pmap_t pmap,
4008 vm_map_address_t va,
4009 pt_entry_t *bpte,
4010 pt_entry_t *epte)
4011 {
4012 bool need_strong_sync = false;
4013 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4014 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4015 if (num_changed > 0) {
4016 PMAP_UPDATE_TLBS(pmap, va,
4017 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4018 }
4019 return num_changed;
4020 }
4021
4022
#ifdef PVH_FLAG_EXEC

/*
 * Rewrite the access-permission (AP) bits of the physical aperture mapping
 * for the page with index 'pai'.
 *
 * This is useful, for example, in guaranteeing that a verified executable
 * page has no writable mappings anywhere in the system, including the
 * physical aperture.  Pass flush_tlb_async == true to skip the TLB
 * synchronization when the caller is guaranteed to follow up with other
 * TLB operations.
 *
 * Nothing is written if the PTE already carries the requested permissions.
 * Only does work when __ARM_PTE_PHYSMAP__ is set; requires the PVH lock for
 * 'pai' to be held in that case.
 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);

	const vm_offset_t aperture_va = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t * const aperture_ptep = pmap_pte(kernel_pmap, aperture_va);
	const pt_entry_t current_pte = *aperture_ptep;

	if ((current_pte & ARM_PTE_APMASK) != ARM_PTE_AP(ap)) {
		const pt_entry_t updated_pte = (current_pte & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);

		if (updated_pte & ARM_PTE_HINT_MASK) {
			panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
			    __func__, aperture_ptep, (void *)aperture_va, updated_pte);
		}

		write_pte_strong(aperture_ptep, updated_pte);
		flush_mmu_tlb_region_asid_async(aperture_va, PAGE_SIZE, kernel_pmap, true, false);

		/* Callers that will issue further TLB operations can defer the sync. */
		if (!flush_tlb_async) {
			sync_tlb_flush();
		}
	}
#endif
}
#endif /* defined(PVH_FLAG_EXEC) */
4058
4059
4060
4061 MARK_AS_PMAP_TEXT int
4062 pmap_remove_range_options(
4063 pmap_t pmap,
4064 vm_map_address_t va,
4065 pt_entry_t *bpte,
4066 pt_entry_t *epte,
4067 vm_map_address_t *eva,
4068 bool *need_strong_sync __unused,
4069 int options)
4070 {
4071 pt_entry_t *cpte;
4072 size_t npages = 0;
4073 int num_removed, num_unwired;
4074 int num_pte_changed;
4075 unsigned int pai = 0;
4076 pmap_paddr_t pa;
4077 int num_external, num_internal, num_reusable;
4078 int num_alt_internal;
4079 uint64_t num_compressed, num_alt_compressed;
4080 int16_t refcnt = 0;
4081
4082 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
4083
4084 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4085 uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
4086
4087 if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
4088 panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
4089 }
4090
4091 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4092 panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
4093 }
4094
4095 num_removed = 0;
4096 num_unwired = 0;
4097 num_pte_changed = 0;
4098 num_external = 0;
4099 num_internal = 0;
4100 num_reusable = 0;
4101 num_compressed = 0;
4102 num_alt_internal = 0;
4103 num_alt_compressed = 0;
4104
4105 #if XNU_MONITOR
4106 bool ro_va = false;
4107 if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
4108 ro_va = true;
4109 }
4110 #endif
4111 for (cpte = bpte; cpte < epte;
4112 cpte += PAGE_RATIO, va += pmap_page_size) {
4113 pt_entry_t spte;
4114 boolean_t managed = FALSE;
4115
4116 /*
4117 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
4118 * so we need to be as aggressive as possible in checking for preemption when we can.
4119 */
4120 if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
4121 *eva = va;
4122 break;
4123 }
4124
4125 spte = *((volatile pt_entry_t*)cpte);
4126
4127 while (!managed) {
4128 if (pmap != kernel_pmap &&
4129 (options & PMAP_OPTIONS_REMOVE) &&
4130 (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
4131 /*
4132 * "pmap" must be locked at this point,
4133 * so this should not race with another
4134 * pmap_remove_range() or pmap_enter().
4135 */
4136
4137 /* one less "compressed"... */
4138 num_compressed++;
4139 if (spte & ARM_PTE_COMPRESSED_ALT) {
4140 /* ... but it used to be "ALTACCT" */
4141 num_alt_compressed++;
4142 }
4143
4144 /* clear marker */
4145 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4146 /*
4147 * "refcnt" also accounts for
4148 * our "compressed" markers,
4149 * so let's update it here.
4150 */
4151 --refcnt;
4152 spte = *((volatile pt_entry_t*)cpte);
4153 }
4154 /*
4155 * It may be possible for the pte to transition from managed
4156 * to unmanaged in this timeframe; for now, elide the assert.
4157 * We should break out as a consequence of checking pa_valid.
4158 */
4159 //assert(!ARM_PTE_IS_COMPRESSED(spte));
4160 pa = pte_to_pa(spte);
4161 if (!pa_valid(pa)) {
4162 #if XNU_MONITOR
4163 unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
4164 #endif
4165 #if XNU_MONITOR
4166 if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
4167 (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
4168 panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
4169 __func__, (uint64_t)pa);
4170 }
4171 #endif
4172 break;
4173 }
4174 #if HAS_FEAT_XS
4175 if (pte_is_xs(pt_attr, spte)) {
4176 *need_strong_sync = true;
4177 }
4178 #endif /* HAS_FEAT_XS */
4179 pai = pa_index(pa);
4180 pvh_lock(pai);
4181 spte = *((volatile pt_entry_t*)cpte);
4182 pa = pte_to_pa(spte);
4183 if (pai == pa_index(pa)) {
4184 managed = TRUE;
4185 break; // Leave pai locked as we will unlock it after we free the PV entry
4186 }
4187 pvh_unlock(pai);
4188 }
4189
4190 if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
4191 /*
4192 * There used to be a valid mapping here but it
4193 * has already been removed when the page was
4194 * sent to the VM compressor, so nothing left to
4195 * remove now...
4196 */
4197 continue;
4198 }
4199
4200 /* remove the translation, do not flush the TLB */
4201 if (*cpte != ARM_PTE_TYPE_FAULT) {
4202 assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4203 assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
4204 #if MACH_ASSERT
4205 if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
4206 panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
4207 cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
4208 }
4209 #endif
4210 write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
4211 num_pte_changed++;
4212 }
4213
4214 if ((spte != ARM_PTE_TYPE_FAULT) &&
4215 (pmap != kernel_pmap)) {
4216 assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
4217 assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
4218 --refcnt;
4219 }
4220
4221 if (pte_is_wired(spte)) {
4222 pte_set_wired(pmap, cpte, 0);
4223 num_unwired++;
4224 }
4225 /*
4226 * if not managed, we're done
4227 */
4228 if (!managed) {
4229 continue;
4230 }
4231
4232 #if XNU_MONITOR
4233 if (__improbable(ro_va)) {
4234 pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
4235 }
4236 #endif
4237
4238 /*
4239 * find and remove the mapping from the chain for this
4240 * physical address.
4241 */
4242 bool is_internal, is_altacct;
4243 pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);
4244
4245 if (is_altacct) {
4246 assert(is_internal);
4247 num_internal++;
4248 num_alt_internal++;
4249 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4250 ppattr_clear_altacct(pai);
4251 ppattr_clear_internal(pai);
4252 }
4253 } else if (is_internal) {
4254 if (ppattr_test_reusable(pai)) {
4255 num_reusable++;
4256 } else {
4257 num_internal++;
4258 }
4259 if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
4260 ppattr_clear_internal(pai);
4261 }
4262 } else {
4263 num_external++;
4264 }
4265 pvh_unlock(pai);
4266 num_removed++;
4267 }
4268
4269 /*
4270 * Update the counts
4271 */
4272 pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);
4273
4274 if (pmap != kernel_pmap) {
4275 if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
4276 panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
4277 }
4278
4279 /* update ledgers */
4280 pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
4281 pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
4282 pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
4283 pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
4284 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
4285 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
4286 pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
4287 /* make needed adjustments to phys_footprint */
4288 pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
4289 ((num_internal -
4290 num_alt_internal) +
4291 (num_compressed -
4292 num_alt_compressed)) * pmap_page_size);
4293 }
4294
4295 /* flush the ptable entries we have written */
4296 if (num_pte_changed > 0) {
4297 FLUSH_PTE_STRONG();
4298 }
4299
4300 return num_pte_changed;
4301 }
4302
4303
4304 /*
4305 * Remove the given range of addresses
4306 * from the specified map.
4307 *
4308 * It is assumed that the start and end are properly
4309 * rounded to the hardware page size.
4310 */
4311 void
4312 pmap_remove(
4313 pmap_t pmap,
4314 vm_map_address_t start,
4315 vm_map_address_t end)
4316 {
4317 pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
4318 }
4319
4320 MARK_AS_PMAP_TEXT vm_map_address_t
4321 pmap_remove_options_internal(
4322 pmap_t pmap,
4323 vm_map_address_t start,
4324 vm_map_address_t end,
4325 int options)
4326 {
4327 vm_map_address_t eva = end;
4328 pt_entry_t *bpte, *epte;
4329 pt_entry_t *pte_p;
4330 tt_entry_t *tte_p;
4331 int remove_count = 0;
4332 bool need_strong_sync = false;
4333 bool unlock = true;
4334
4335 if (__improbable(end < start)) {
4336 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
4337 }
4338
4339 validate_pmap_mutable(pmap);
4340
4341 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4342
4343 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
4344
4345 tte_p = pmap_tte(pmap, start);
4346
4347 if (tte_p == (tt_entry_t *) NULL) {
4348 goto done;
4349 }
4350
4351 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
4352 pte_p = (pt_entry_t *) ttetokv(*tte_p);
4353 bpte = &pte_p[pte_index(pt_attr, start)];
4354 epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));
4355
4356 /*
4357 * This check is really intended to ensure that mappings in a nested pmap can't be removed
4358 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
4359 * Note that kernel page tables may not have PTDs, so we can't use the check there.
4360 */
4361 if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
4362 panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
4363 __func__, ptep_get_pmap(bpte), pmap, bpte);
4364 }
4365
4366 remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
4367 &need_strong_sync, options);
4368
4369 if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
4370 pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
4371 remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
4372 unlock = false; // pmap_tte_deallocate() has dropped the lock
4373 }
4374 }
4375
4376 done:
4377 if (unlock) {
4378 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
4379 }
4380
4381 if (remove_count > 0) {
4382 PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
4383 }
4384 return eva;
4385 }
4386
4387 void
4388 pmap_remove_options(
4389 pmap_t pmap,
4390 vm_map_address_t start,
4391 vm_map_address_t end,
4392 int options)
4393 {
4394 vm_map_address_t va;
4395
4396 if (pmap == PMAP_NULL) {
4397 return;
4398 }
4399
4400 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4401
4402 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
4403 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
4404 VM_KERNEL_ADDRHIDE(end));
4405
4406 #if MACH_ASSERT
4407 if ((start | end) & pt_attr_leaf_offmask(pt_attr)) {
4408 panic("pmap_remove_options() pmap %p start 0x%llx end 0x%llx",
4409 pmap, (uint64_t)start, (uint64_t)end);
4410 }
4411 if ((end < start) || (start < pmap->min) || (end > pmap->max)) {
4412 panic("pmap_remove_options(): invalid address range, pmap=%p, start=0x%llx, end=0x%llx",
4413 pmap, (uint64_t)start, (uint64_t)end);
4414 }
4415 #endif
4416
4417 /*
4418 * We allow single-page requests to execute non-preemptibly,
4419 * as it doesn't make sense to sample AST_URGENT for a single-page
4420 * operation, and there are a couple of special use cases that
4421 * require a non-preemptible single-page operation.
4422 */
4423 if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
4424 pmap_verify_preemptible();
4425 }
4426
4427 /*
4428 * Invalidate the translation buffer first
4429 */
4430 va = start;
4431 while (va < end) {
4432 vm_map_address_t l;
4433
4434 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
4435 if (l > end) {
4436 l = end;
4437 }
4438
4439 #if XNU_MONITOR
4440 va = pmap_remove_options_ppl(pmap, va, l, options);
4441
4442 pmap_ledger_check_balance(pmap);
4443 #else
4444 va = pmap_remove_options_internal(pmap, va, l, options);
4445 #endif
4446 }
4447
4448 PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
4449 }
4450
4451
4452 /*
4453 * Remove phys addr if mapped in specified map
4454 */
4455 void
4456 pmap_remove_some_phys(
4457 __unused pmap_t map,
4458 __unused ppnum_t pn)
4459 {
4460 /* Implement to support working set code */
4461 }
4462
4463 /*
4464 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4465 * switch a thread onto a new vm_map.
4466 */
4467 void
4468 pmap_switch_user(thread_t thread, vm_map_t new_map)
4469 {
4470 pmap_t new_pmap = new_map->pmap;
4471
4472
4473 thread->map = new_map;
4474 pmap_set_pmap(new_pmap, thread);
4475
4476 }
4477
4478 void
4479 pmap_set_pmap(
4480 pmap_t pmap,
4481 #if !__ARM_USER_PROTECT__
4482 __unused
4483 #endif
4484 thread_t thread)
4485 {
4486 pmap_switch(pmap);
4487 #if __ARM_USER_PROTECT__
4488 thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
4489 thread->machine.asid = pmap->hw_asid;
4490 #endif
4491 }
4492
4493 static void
4494 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4495 {
4496 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4497 }
4498
4499 static inline bool
4500 pmap_user_ttb_is_clear(void)
4501 {
4502 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4503 }
4504
4505 MARK_AS_PMAP_TEXT void
4506 pmap_switch_internal(
4507 pmap_t pmap)
4508 {
4509 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4510 #if XNU_MONITOR
4511 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4512 #endif
4513 validate_pmap_mutable(pmap);
4514 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4515 uint16_t asid_index = pmap->hw_asid;
4516 bool do_asid_flush = false;
4517 bool do_commpage_flush = false;
4518
4519 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4520 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4521 }
4522 #if __ARM_KERNEL_PROTECT__
4523 asid_index >>= 1;
4524 #endif
4525
4526 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4527 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4528 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4529 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4530 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4531 bool break_before_make = do_shared_region_flush;
4532
4533 #if !HAS_16BIT_ASID
4534 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4535 asid_index -= 1;
4536 pmap_update_plru(asid_index);
4537
4538 /* Paranoia. */
4539 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4540
4541 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4542 uint8_t new_sw_asid = pmap->sw_asid;
4543 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4544
4545 if (new_sw_asid != last_sw_asid) {
4546 /*
4547 * If the virtual ASID of the new pmap does not match the virtual ASID
4548 * last seen on this CPU for the physical ASID (that was a mouthful),
4549 * then this switch runs the risk of aliasing. We need to flush the
			 * TLB for this physical ASID in this case.
4551 */
4552 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4553 do_asid_flush = true;
4554 break_before_make = true;
4555 }
4556 }
4557 #endif /* !HAS_16BIT_ASID */
4558
4559 #if __ARM_MIXED_PAGE_SIZE__
4560 if (pt_attr->pta_tcr_value != get_tcr()) {
4561 break_before_make = true;
4562 }
4563 #endif
4564 #if __ARM_MIXED_PAGE_SIZE__
4565 /*
4566 * For mixed page size configurations, we need to flush the global commpage mappings from
4567 * the TLB when transitioning between address spaces with different page sizes. Otherwise
	 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
	 * partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4570 * conflict abort or other unpredictable behavior.
4571 */
4572 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4573 do_commpage_flush = true;
4574 }
4575 if (do_commpage_flush) {
4576 break_before_make = true;
4577 }
4578 #endif
4579 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4580 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4581 pmap_clear_user_ttb_internal();
4582 }
4583
4584 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4585 * to flush the userspace mappings for that region. Those mappings are global
4586 * and will not be protected by the ASID. It should also be cheaper to flush the
4587 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4588 if (__improbable(do_shared_region_flush)) {
4589 #if __ARM_RANGE_TLBI__
4590 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4591 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4592
4593 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4594 * There may still be non-global entries that overlap with the incoming pmap's
4595 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4596 * must necessarily belong to a different ASID than the incoming pmap, or they would
4597 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4598 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4599 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4600 * to consider additional invalidation here in the future. */
4601 if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
4602 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4603 } else {
4604 /*
4605 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
4606 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
4607 * have a single-page shared region anyway, not least because pmap_nest()
4608 * requires L2 block alignment of the address and size.
4609 */
4610 do_asid_flush = false;
4611 flush_core_tlb_async();
4612 }
4613 #else
4614 do_asid_flush = false;
4615 flush_core_tlb_async();
4616 #endif // __ARM_RANGE_TLBI__
4617 }
4618
4619 #if __ARM_MIXED_PAGE_SIZE__
4620 if (__improbable(do_commpage_flush)) {
4621 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4622 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4623 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4624 flush_core_tlb_allrange_async(rtlbi_param);
4625 }
4626 #endif
4627 if (__improbable(do_asid_flush)) {
4628 pmap_flush_core_tlb_asid_async(pmap);
4629 #if DEVELOPMENT || DEBUG
4630 os_atomic_inc(&pmap_asid_flushes, relaxed);
4631 #endif
4632 }
4633 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4634 sync_tlb_flush_local();
4635 }
4636
4637 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4638 }
4639
4640 void
4641 pmap_switch(
4642 pmap_t pmap)
4643 {
4644 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4645 #if XNU_MONITOR
4646 pmap_switch_ppl(pmap);
4647 #else
4648 pmap_switch_internal(pmap);
4649 #endif
4650 PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
4651 }
4652
4653 void
4654 pmap_page_protect(
4655 ppnum_t ppnum,
4656 vm_prot_t prot)
4657 {
4658 pmap_page_protect_options(ppnum, prot, 0, NULL);
4659 }
4660
/*
 *	Routine:	pmap_page_protect_options_with_flush_range
 *
 *	Function:
 *		Lower the permission for all mappings to a given
 *		page.
 *
 *	Parameters:
 *		ppnum		Physical page number whose mappings are updated.
 *				Pages for which pa_valid() is false are ignored.
 *		prot		VM_PROT_ALL is a no-op.  VM_PROT_READ and
 *				VM_PROT_READ | VM_PROT_EXECUTE downgrade every CPU
 *				mapping to read-only.  Any other value removes every
 *				CPU mapping of the page.
 *		options		PMAP_OPTIONS_NOFLUSH is cleared when mappings are being
 *				removed.  PMAP_OPTIONS_COMPRESSOR causes removed internal
 *				mappings of non-kernel pmaps to be replaced with a
 *				"compressed" marker and adjusts ledger accounting.
 *		flush_range	Optional.  For non-removal updates whose VA lies in
 *				[ptfr_start, ptfr_end) of ptfr_pmap, no TLB invalidate
 *				is issued here; ptfr_flush_needed is set instead.
 *
 *	The work is done in two passes over the page's PV list while the PV head
 *	lock is held: pass 1 rewrites the PTEs and updates accounting, pass 2
 *	issues TLB invalidations and (on removal) unlinks CPU mappings from the
 *	PV list.  IOMMU mappings are never modified or removed.
 */
MARK_AS_PMAP_TEXT static void
pmap_page_protect_options_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t prot,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t    **pv_h;
	pv_entry_t     *pve_p, *orig_pve_p;
	/* pveh_p/pvet_p: head and tail of the PV entry chain passed to pv_list_free() at the end. */
	pv_entry_t     *pveh_p;
	pv_entry_t     *pvet_p;
	pt_entry_t     *pte_p, *orig_pte_p;
	/* new_pve_p/new_pte_p: IOMMU-bearing PV entries (or lone IOMMU PTE) that become the new PV head on removal. */
	pv_entry_t     *new_pve_p;
	pt_entry_t     *new_pte_p;
	vm_offset_t     pvh_flags;
	unsigned int    pai;
	bool            remove;
	bool            set_NX;
	/* Number of PV entries accumulated for pv_list_free(). */
	unsigned int    pvh_cnt = 0;
	/* Per-pass counts of mappings that required an update; compared after pass 2 as a consistency check. */
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	/* Only work with managed pages. */
	if (!pa_valid(phys)) {
		return;
	}

	/*
	 * Determine the new protection.
	 */
	switch (prot) {
	case VM_PROT_ALL:
		return;         /* nothing to do */
	case VM_PROT_READ:
	case VM_PROT_READ | VM_PROT_EXECUTE:
		remove = false;
		break;
	default:
		/* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
		options = options & ~PMAP_OPTIONS_NOFLUSH;
		remove = true;
		break;
	}

	pmap_cpu_data_t *pmap_cpu_data = NULL;
	if (remove) {
#if !XNU_MONITOR
		/* Re-enabled at the end of the function, after inflight_disconnect is cleared. */
		mp_disable_preemption();
#endif
		pmap_cpu_data = pmap_get_cpu_data();
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
		/*
		 * Ensure the store to inflight_disconnect will be observed before any of the
		 * ensuing PTE/refcount stores in this function.  This flag is used to avoid
		 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
		 * another CPU, in between this function's clearing a PTE and dropping the
		 * corresponding pagetable refcount.  That can lead to a panic if the
		 * destroying thread observes a non-zero refcount.  For this we need a store-
		 * store barrier; a store-release operation would not be sufficient.
		 */
		os_atomic_thread_fence(release);
	}

	/* The PV head lock taken here is held until pvh_unlock() near the end of the function. */
	pai = pa_index(phys);
	pvh_lock(pai);
	pv_h = pai_to_pvh(pai);
	pvh_flags = pvh_get_flags(pv_h);

#if XNU_MONITOR
	if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
		panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
	}
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif


	orig_pte_p = pte_p = PT_ENTRY_NULL;
	orig_pve_p = pve_p = PV_ENTRY_NULL;
	pveh_p = PV_ENTRY_NULL;
	pvet_p = PV_ENTRY_NULL;
	new_pve_p = PV_ENTRY_NULL;
	new_pte_p = PT_ENTRY_NULL;


	/*
	 * The PV head either points directly at a single PTE, points at a list of PV
	 * entries, or is empty.  Any other type is treated as corruption.
	 */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		orig_pte_p = pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		orig_pve_p = pve_p = pvh_pve_list(pv_h);
		pveh_p = pve_p;
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	/* Pass 1: Update all CPU PTEs and accounting info as necessary */
	int pve_ptep_idx = 0;

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;
	const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
	/*
	 * Visits the single PTE (PVH_TYPE_PTEP case) or every PTE slot of every PV entry
	 * (PVH_TYPE_PVEP case).  pte_p is reset to NULL at the bottom of each iteration,
	 * which terminates the loop in the single-PTE case.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
		bool update = false;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PV entry. */
				goto protect_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are left untouched in pass 1 (after sanity checks). */
		if (pvh_ptep_is_iommu(pte_p)) {
#if XNU_MONITOR
			if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
#endif
			if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
				panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
				    ppnum, ptep_get_iommu(pte_p), pve_p);
			}
			goto protect_skip_pve_pass1;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		/* The PTE must belong to a pmap and must actually map ppnum; otherwise the PV list is inconsistent. */
		if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
#if MACH_ASSERT
			if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
				/* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
				pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);

				pv_entry_t *check_pvep = pve_p;

				do {
					if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
						panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
						    "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
					}
				} while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);

				/* Restore previous PTEP value. */
				pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
			}
#endif
			panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
			    pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
		}

		/* Only set the no-execute attribute when execute was not requested (and, on DEVELOPMENT/DEBUG, NX is enabled). */
#if DEVELOPMENT || DEBUG
		if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
		if ((prot & VM_PROT_EXECUTE))
#endif
		{
			set_NX = false;
		} else {
			set_NX = true;
		}

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */

		/* Remove the mapping if new protection is NONE */
		if (remove) {
			if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
				panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
				    __func__, pmap, ppnum);
			}

			const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
			const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
			const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
			pt_entry_t spte = *pte_p;

			/* Drop the wired state (and its ledger charge for user pmaps) before tearing down the mapping. */
			if (pte_is_wired(spte)) {
				pte_set_wired(pmap, pte_p, 0);
				spte = *pte_p;
				if (pmap != kernel_pmap) {
					pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}

			assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
			    (uint64_t)spte, pte_p, ppnum);

			if (compress && is_internal && (pmap != kernel_pmap)) {
				assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
				/* mark this PTE as having been "compressed" */
				tmplate = ARM_PTE_COMPRESSED;
				if (is_altacct) {
					tmplate |= ARM_PTE_COMPRESSED_ALT;
				}
			} else {
				tmplate = ARM_PTE_TYPE_FAULT;
			}

			assert(spte != tmplate);
			write_pte_fast(pte_p, tmplate);
			update = true;
			++pass1_updated;

			pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/* Ledger accounting below applies to non-kernel pmaps only. */
			if (pmap != kernel_pmap) {
				if (ppattr_test_reusable(pai) &&
				    is_internal &&
				    !is_altacct) {
					pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}

				if (is_altacct) {
					assert(is_internal);
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
					ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
				} else if (ppattr_test_reusable(pai)) {
					assert(is_internal);
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						/* was not in footprint, but is now */
						pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else if (is_internal) {
					pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);

					/*
					 * Update all stats related to physical footprint, which only
					 * deals with internal pages.
					 */
					if (options & PMAP_OPTIONS_COMPRESSOR) {
						/*
						 * This removal is only being done so we can send this page to
						 * the compressor; therefore it mustn't affect total task footprint.
						 */
						pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						/*
						 * This internal page isn't going to the compressor, so adjust stats to keep
						 * phys_footprint up to date.
						 */
						pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					}
					ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
				} else {
					/* external page: no impact on ledgers */
				}
			}
			assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
		} else {
			/* Downgrade path: build a read-only version of the existing PTE. */
			pt_entry_t spte = *pte_p;
			const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

			if (pmap == kernel_pmap) {
				tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
			} else {
				tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
			}

			/*
			 * While the naive implementation of this would serve to add execute
			 * permission, this is not how the VM uses this interface, or how
			 * x86_64 implements it.  So ignore requests to add execute permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			}


			assert(spte != ARM_PTE_TYPE_FAULT);
			assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));

			if (spte != tmplate) {
				/*
				 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
				 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
				 * require permission updates.  We use the ARM_PTE_WRITEABLE bit as that bit
				 * should always be cleared by this function.
				 */
				pte_set_was_writeable(tmplate, true);
				write_pte_fast(pte_p, tmplate);
				update = true;
				++pass1_updated;
			} else if (pte_was_writeable(tmplate)) {
				/*
				 * We didn't change any of the relevant permission bits in the PTE, so we don't need
				 * to flush the TLB, but we do want to clear the "was_writeable" flag.  When revoking
				 * write access to a page, this function should always at least clear that flag for
				 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
				 * these mappings go through vm_fault().  We therefore don't want those accesses to
				 * be handled through arm_fast_fault().
				 */
				pte_set_was_writeable(tmplate, false);
				write_pte_fast(pte_p, tmplate);
			}
		}

		/*
		 * Once issue_tlbi has been set there is nothing further to decide.  Until then, any
		 * update (without NOFLUSH) means a flush is needed at some point; the flush must be
		 * issued by this function unless this is a non-removal update covered by flush_range.
		 */
		if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
			tlb_flush_needed = true;
			if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
			    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
				issue_tlbi = true;
			}
		}
protect_skip_pve_pass1:
		/* Advance to the next PTE slot, moving to the next PV entry when this one is exhausted. */
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	/*
	 * NOTE(review): if this early exit is taken after pass 1 updated at least one PTE
	 * (which looks possible for a non-remove request when every updated VA fell inside
	 * flush_range, or when PMAP_OPTIONS_NOFLUSH was set), pass2_updated stays 0, so the
	 * pass1/pass2 consistency check after protect_finish would panic, and the
	 * "was_writeable" marker set in pass 1 would not be cleared.  Presumably callers
	 * never produce that combination -- confirm.
	 */
	if (!remove && !issue_tlbi) {
		goto protect_finish;
	}

	/* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
	pv_entry_t **pve_pp = pv_h;
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	/*
	 * We need to keep track of whether a particular PVE list contains IOMMU
	 * mappings when removing entries, because we should only remove CPU
	 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
	 * it around.
	 */
	bool iommu_mapping_in_pve = false;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto protect_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			iommu_mapping_in_pve = true;
			if (remove && (pve_p == PV_ENTRY_NULL)) {
				/*
				 * We've found an IOMMU entry and it's the only entry in the PV list.
				 * We don't discard IOMMU entries, so simply set up the new PV list to
				 * contain the single IOMMU PTE and exit the loop.
				 */
				new_pte_p = pte_p;
				break;
			}
			goto protect_skip_pve_pass2;
		}
#endif
		pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (remove) {
			if (!compress && (pmap != kernel_pmap)) {
				/*
				 * We must wait to decrement the refcount until we're completely finished using the PTE
				 * on this path.  Otherwise, if we happened to drop the refcount to zero, a concurrent
				 * pmap_remove() call might observe the zero refcount and free the pagetable out from
				 * under us.
				 */
				if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
					panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
				}
			}
			/* Remove this CPU mapping from PVE list. */
			if (pve_p != PV_ENTRY_NULL) {
				pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
			}
		} else {
			/*
			 * Downgrade path: only PTEs marked in pass 1 (via the "was_writeable" bit) need a
			 * TLB invalidate.  Clear the marker and fall through; skip unmarked PTEs.
			 */
			pt_entry_t spte = *pte_p;
			if (pte_was_writeable(spte)) {
				pte_set_was_writeable(spte, false);
				write_pte_fast(pte_p, spte);
			} else {
				goto protect_skip_pve_pass2;
			}
		}
		++pass2_updated;
		/* Issue the invalidate here unless this is a non-removal update covered by flush_range. */
		if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

protect_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;

			if (remove) {
				/**
				 * If there are any IOMMU mappings in the PVE list, preserve
				 * those mappings in a new PVE list (new_pve_p) which will later
				 * become the new PVH entry. Keep track of the CPU mappings in
				 * pveh_p/pvet_p so they can be deallocated later.
				 */
				if (iommu_mapping_in_pve) {
					iommu_mapping_in_pve = false;
					pv_entry_t *temp_pve_p = pve_next(pve_p);
					pve_remove(pv_h, pve_pp, pve_p);
					pveh_p = pvh_pve_list(pv_h);
					pve_p->pve_next = new_pve_p;
					new_pve_p = pve_p;
					pve_p = temp_pve_p;
					/* pve_pp is intentionally left unchanged: the entry it pointed past was just unlinked. */
					continue;
				} else {
					pvet_p = pve_p;
					pvh_cnt++;
				}
			}

			pve_pp = pve_next_ptr(pve_p);
			pve_p = pve_next(pve_p);
			iommu_mapping_in_pve = false;
		}
	}

protect_finish:

#ifdef PVH_FLAG_EXEC
	if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
		pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
	}
#endif
	/* Both passes must agree on how many mappings needed updating. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	/* if we removed a bunch of entries, take care of them now */
	if (remove) {
		if (new_pve_p != PV_ENTRY_NULL) {
			/* Preserved IOMMU-bearing PV entries become the new list; restore the saved flags. */
			pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else if (new_pte_p != PT_ENTRY_NULL) {
			/* A lone IOMMU PTE remains as the PV head; restore the saved flags. */
			pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
			pvh_set_flags(pv_h, pvh_flags);
		} else {
			/* No mappings remain for this page. */
			if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
				pmap_flush_noncoherent_page(phys);
			}
			pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
		}
	}

	/*
	 * For a non-removal request with a flush_range, record that a flush is needed in the
	 * range descriptor and skip the local sync below (presumably the caller completes the
	 * flush for the range -- confirm against callers).
	 */
	if (flush_range && tlb_flush_needed) {
		if (!remove) {
			flush_range->ptfr_flush_needed = true;
			tlb_flush_needed = false;
		}
	}

	/*
	 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
	 * lock to allow the backing pages to be repurposed.  This is a security precaution, aimed
	 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
	 * a page to be repurposed while it is still live in the TLBs.
	 */
	if (remove && tlb_flush_needed) {
		sync_tlb_flush();
	}


	pvh_unlock(pai);

	if (remove) {
		/* Pairs with the inflight_disconnect store and preemption disable at the top of the function. */
		os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
#if !XNU_MONITOR
		mp_enable_preemption();
#endif
	}

	/* For downgrades, the TLB sync is performed after the PVH lock has been dropped. */
	if (!remove && tlb_flush_needed) {
		sync_tlb_flush();
	}

	/* Free the PV entries whose CPU mappings were removed in pass 2. */
	if (remove && (pvet_p != PV_ENTRY_NULL)) {
		pv_list_free(pveh_p, pvet_p, pvh_cnt);
	}
}
5185
5186 MARK_AS_PMAP_TEXT void
5187 pmap_page_protect_options_internal(
5188 ppnum_t ppnum,
5189 vm_prot_t prot,
5190 unsigned int options,
5191 void *arg)
5192 {
5193 if (arg != NULL) {
5194 /*
5195 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5196 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5197 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5198 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5199 * In that case, force the flush to take place.
5200 */
5201 options &= ~PMAP_OPTIONS_NOFLUSH;
5202 }
5203 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5204 }
5205
5206 void
5207 pmap_page_protect_options(
5208 ppnum_t ppnum,
5209 vm_prot_t prot,
5210 unsigned int options,
5211 void *arg)
5212 {
5213 pmap_paddr_t phys = ptoa(ppnum);
5214
5215 assert(ppnum != vm_page_fictitious_addr);
5216
5217 /* Only work with managed pages. */
5218 if (!pa_valid(phys)) {
5219 return;
5220 }
5221
5222 /*
5223 * Determine the new protection.
5224 */
5225 if (prot == VM_PROT_ALL) {
5226 return; /* nothing to do */
5227 }
5228
5229 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5230
5231 #if XNU_MONITOR
5232 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5233 #else
5234 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5235 #endif
5236
5237 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5238 }
5239
5240
5241 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
5242 MARK_AS_PMAP_TEXT void
5243 pmap_disable_user_jop_internal(pmap_t pmap)
5244 {
5245 if (pmap == kernel_pmap) {
5246 panic("%s: called with kernel_pmap", __func__);
5247 }
5248 validate_pmap_mutable(pmap);
5249 pmap->disable_jop = true;
5250 }
5251
5252 void
5253 pmap_disable_user_jop(pmap_t pmap)
5254 {
5255 #if XNU_MONITOR
5256 pmap_disable_user_jop_ppl(pmap);
5257 #else
5258 pmap_disable_user_jop_internal(pmap);
5259 #endif
5260 }
5261 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5262
5263 /*
5264 * Indicates if the pmap layer enforces some additional restrictions on the
5265 * given set of protections.
5266 */
5267 bool
5268 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5269 {
5270 return false;
5271 }
5272
5273 /*
5274 * Set the physical protection on the
5275 * specified range of this map as requested.
5276 * VERY IMPORTANT: Will not increase permissions.
5277 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5278 */
5279 void
5280 pmap_protect(
5281 pmap_t pmap,
5282 vm_map_address_t b,
5283 vm_map_address_t e,
5284 vm_prot_t prot)
5285 {
5286 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5287 }
5288
/**
 * Lowers the protection of the leaf mappings in [start, end) of a pmap.  The
 * range must not extend past the first twig-size boundary above start.
 *
 * The loop over PTEs checks for pending preemption every
 * PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL pages and stops early if
 * preemption is pending, so the caller must be prepared to call again from
 * the returned address.
 *
 * @param pmap    The pmap to update; validated with validate_pmap_mutable().
 * @param start   First virtual address of the range.
 * @param end     End of the range; panics if end < start or if end lies beyond
 *                the next twig boundary after start.
 * @param prot    Requested protection.  VM_PROT_READ | VM_PROT_WRITE and
 *                VM_PROT_ALL are no-ops (unless PMAP_OPTIONS_PROTECT_IMMEDIATE
 *                is in effect on DEVELOPMENT/DEBUG builds).  Values that would
 *                amount to a removal cause a panic.
 * @param options PMAP_OPTIONS_* flags; PMAP_OPTIONS_PROTECT_IMMEDIATE is only
 *                honored on DEVELOPMENT/DEBUG builds.
 * @param args    Only used in the panic message.
 *
 * @return The virtual address at which processing stopped: `end` for a no-op
 *         request or when no leaf table covers `start`, otherwise the value of
 *         the loop cursor when the PTE loop exited.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t      *tte_p;
	pt_entry_t      *bpte_p, *epte_p;
	pt_entry_t      *pte_p;
	boolean_t        set_NX = TRUE;
	/* set_XO: execute-only request (prot == VM_PROT_EXECUTE). */
	boolean_t        set_XO = FALSE;
	boolean_t        should_have_removed = FALSE;
	/* Set when any updated PTE is of an XS type; passed on to PMAP_UPDATE_TLBS(). */
	bool             need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must be well-formed and must not cross into the next twig region. */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Removal requests are expected to be routed elsewhere by the caller. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	/* va tracks the address corresponding to pte_p and becomes the return value. */
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* Only proceed if a table-type entry covers the start address. */
	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Periodically yield to pending preemption; va then marks where the caller should resume. */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t  force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			/* Skip entries that are not valid mappings. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t    pa;
			unsigned int    pai = 0;
			boolean_t       managed = FALSE;

			/*
			 * For managed pages, take the PVH lock for the page the PTE maps.  The PTE is
			 * re-read after locking; if it no longer maps the same page index, the lock is
			 * dropped and the lookup is retried.
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			/* Re-check: spte may have been re-read inside the loop above. */
			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t      tmplate;

			/* Build the new PTE: replace the access-permission bits, read-only unless a debug force-write applies. */
			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						/* Execute-only request: replace the AP bits with the pt_attr_leaf_rona() encoding. */
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			/* Release the PVH lock taken in the managed-page lookup above. */
			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Synchronize the PTE writes, then update the TLBs for the portion of the range actually processed. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table here: report the whole range as processed. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5536
5537 void
5538 pmap_protect_options(
5539 pmap_t pmap,
5540 vm_map_address_t b,
5541 vm_map_address_t e,
5542 vm_prot_t prot,
5543 unsigned int options,
5544 __unused void *args)
5545 {
5546 vm_map_address_t l, beg;
5547
5548 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5549
5550 if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
5551 panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
5552 pmap, (uint64_t)b, (uint64_t)e);
5553 }
5554
5555 /*
5556 * We allow single-page requests to execute non-preemptibly,
5557 * as it doesn't make sense to sample AST_URGENT for a single-page
5558 * operation, and there are a couple of special use cases that
5559 * require a non-preemptible single-page operation.
5560 */
5561 if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
5562 pmap_verify_preemptible();
5563 }
5564
5565 #if DEVELOPMENT || DEBUG
5566 if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
5567 if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
5568 pmap_remove_options(pmap, b, e, options);
5569 return;
5570 }
5571 } else
5572 #endif
5573 {
5574 /* Determine the new protection. */
5575 switch (prot) {
5576 case VM_PROT_EXECUTE:
5577 case VM_PROT_READ:
5578 case VM_PROT_READ | VM_PROT_EXECUTE:
5579 break;
5580 case VM_PROT_READ | VM_PROT_WRITE:
5581 case VM_PROT_ALL:
5582 return; /* nothing to do */
5583 default:
5584 pmap_remove_options(pmap, b, e, options);
5585 return;
5586 }
5587 }
5588
5589 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
5590 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
5591 VM_KERNEL_ADDRHIDE(e));
5592
5593 beg = b;
5594
5595 while (beg < e) {
5596 l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
5597
5598 if (l > e) {
5599 l = e;
5600 }
5601
5602 #if XNU_MONITOR
5603 beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
5604 #else
5605 beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
5606 #endif
5607 }
5608
5609 PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
5610 }
5611
5612 /**
5613 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5614 *
5615 * @param pmap pmap to insert the pages into.
5616 * @param va virtual address to map the pages into.
5617 * @param pa page number of the first physical page to map.
5618 * @param size block size, in number of pages.
5619 * @param prot mapping protection attributes.
5620 * @param attr flags to pass to pmap_enter().
5621 *
5622 * @return KERN_SUCCESS.
5623 */
5624 kern_return_t
5625 pmap_map_block(
5626 pmap_t pmap,
5627 addr64_t va,
5628 ppnum_t pa,
5629 uint32_t size,
5630 vm_prot_t prot,
5631 int attr,
5632 unsigned int flags)
5633 {
5634 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5635 }
5636
5637 /**
5638 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5639 * As opposed to pmap_map_block(), this function takes
5640 * a physical address as an input and operates using the
5641 * page size associated with the input pmap.
5642 *
5643 * @param pmap pmap to insert the pages into.
5644 * @param va virtual address to map the pages into.
5645 * @param pa physical address of the first physical page to map.
5646 * @param size block size, in number of pages.
5647 * @param prot mapping protection attributes.
5648 * @param attr flags to pass to pmap_enter().
5649 *
5650 * @return KERN_SUCCESS.
5651 */
5652 kern_return_t
5653 pmap_map_block_addr(
5654 pmap_t pmap,
5655 addr64_t va,
5656 pmap_paddr_t pa,
5657 uint32_t size,
5658 vm_prot_t prot,
5659 int attr,
5660 unsigned int flags)
5661 {
5662 #if __ARM_MIXED_PAGE_SIZE__
5663 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5664 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5665 #else
5666 const uint64_t pmap_page_size = PAGE_SIZE;
5667 #endif
5668
5669 for (ppnum_t page = 0; page < size; page++) {
5670 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5671 panic("%s: failed pmap_enter_addr, "
5672 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5673 __FUNCTION__,
5674 pmap, va, (uint64_t)pa, size, prot, flags);
5675 }
5676
5677 va += pmap_page_size;
5678 pa += pmap_page_size;
5679 }
5680
5681 return KERN_SUCCESS;
5682 }
5683
5684 kern_return_t
5685 pmap_enter_addr(
5686 pmap_t pmap,
5687 vm_map_address_t v,
5688 pmap_paddr_t pa,
5689 vm_prot_t prot,
5690 vm_prot_t fault_type,
5691 unsigned int flags,
5692 boolean_t wired)
5693 {
5694 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL);
5695 }
5696
5697 /*
5698 * Insert the given physical page (p) at
5699 * the specified virtual address (v) in the
5700 * target physical map with the protection requested.
5701 *
5702 * If specified, the page will be wired down, meaning
5703 * that the related pte can not be reclaimed.
5704 *
5705 * NB: This is the only routine which MAY NOT lazy-evaluate
5706 * or lose information. That is, this routine must actually
5707 * insert this page into the given map eventually (must make
5708 * forward progress eventually.
5709 */
5710 kern_return_t
5711 pmap_enter(
5712 pmap_t pmap,
5713 vm_map_address_t v,
5714 ppnum_t pn,
5715 vm_prot_t prot,
5716 vm_prot_t fault_type,
5717 unsigned int flags,
5718 boolean_t wired)
5719 {
5720 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5721 }
5722
5723 /*
5724 * Attempt to commit the pte.
5725 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5726 * Performs no page table or accounting writes on failures.
5727 */
5728 static inline bool
5729 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5730 {
5731 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5732 bool success = false, changed_wiring = false;
5733
5734 __unreachable_ok_push
5735 if (TEST_PAGE_RATIO_4) {
5736 /*
5737 * 16K virtual pages w/ 4K hw pages.
5738 * We actually need to update 4 ptes here which can't easily be done atomically.
5739 * As a result we require the exclusive pmap lock.
5740 */
5741 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5742 *old_pte = *pte_p;
5743 if (*old_pte == new_pte) {
5744 /* Another thread completed this operation. Nothing to do here. */
5745 success = true;
5746 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5747 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5748 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5749 success = false;
5750 } else {
5751 write_pte_fast(pte_p, new_pte);
5752 success = true;
5753 }
5754 } else {
5755 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5756 }
5757 __unreachable_ok_pop
5758
5759 if (success && *old_pte != new_pte) {
5760 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5761 bool need_strong_sync = false;
5762 FLUSH_PTE_STRONG();
5763 #if HAS_FEAT_XS
5764 if (pte_is_xs(pt_attr, *old_pte)) {
5765 need_strong_sync = true;
5766 }
5767 #endif /* HAS_FEAT_XS */
5768 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5769 } else {
5770 FLUSH_PTE();
5771 __builtin_arm_isb(ISB_SY);
5772 }
5773 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5774 (new_pte & ARM_PTE_WIRED) != 0 :
5775 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5776
5777 if (pmap != kernel_pmap && changed_wiring) {
5778 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5779 if (new_pte & ARM_PTE_WIRED) {
5780 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5781 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5782 } else {
5783 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5784 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5785 }
5786 }
5787
5788 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5789 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5790 }
5791 return success;
5792 }
5793
5794 MARK_AS_PMAP_TEXT static pt_entry_t
5795 wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
5796 {
5797 pt_entry_t pte;
5798
5799 switch (wimg & (VM_WIMG_MASK)) {
5800 case VM_WIMG_IO:
5801 // Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
5802 // Device-nGnRnE. On H14+, accesses to them can be reordered by
5803 // AP, while preserving the security benefits of using device
5804 // mapping against side-channel attacks. On pre-H14 platforms,
5805 // the accesses will still be strongly ordered.
5806 if (is_dram_addr(pa)) {
5807 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5808 } else {
5809 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
5810 }
5811 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5812 break;
5813 case VM_WIMG_RT:
5814 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
5815 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5816 break;
5817 case VM_WIMG_POSTED:
5818 if (is_dram_addr(pa)) {
5819 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5820 } else {
5821 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
5822 }
5823 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5824 break;
5825 case VM_WIMG_POSTED_REORDERED:
5826 if (is_dram_addr(pa)) {
5827 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5828 } else {
5829 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
5830 }
5831 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5832 break;
5833 case VM_WIMG_POSTED_COMBINED_REORDERED:
5834 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
5835 #if HAS_FEAT_XS
5836 if (!is_dram_addr(pa)) {
5837 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
5838 }
5839 #endif /* HAS_FEAT_XS */
5840 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5841 break;
5842 case VM_WIMG_WCOMB:
5843 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
5844 pte |= ARM_PTE_NX | ARM_PTE_PNX;
5845 break;
5846 case VM_WIMG_WTHRU:
5847 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
5848 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5849 break;
5850 case VM_WIMG_COPYBACK:
5851 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
5852 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5853 break;
5854 case VM_WIMG_INNERWBACK:
5855 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
5856 pte |= ARM_PTE_SH(SH_INNER_MEMORY);
5857 break;
5858 default:
5859 pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
5860 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
5861 }
5862
5863 return pte;
5864 }
5865
5866
5867 /*
5868 * Construct a PTE (and the physical page attributes) for the given virtual to
5869 * physical mapping.
5870 *
5871 * This function has no side effects and is safe to call so that it is safe to
5872 * call while attempting a pmap_enter transaction.
5873 */
5874 MARK_AS_PMAP_TEXT static pt_entry_t
5875 pmap_construct_pte(
5876 const pmap_t pmap,
5877 vm_map_address_t va,
5878 pmap_paddr_t pa,
5879 vm_prot_t prot,
5880 vm_prot_t fault_type,
5881 boolean_t wired,
5882 const pt_attr_t* const pt_attr,
5883 uint16_t *pp_attr_bits /* OUTPUT */
5884 )
5885 {
5886 bool set_NX = false, set_XO = false;
5887 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5888 assert(pp_attr_bits != NULL);
5889 *pp_attr_bits = 0;
5890
5891 if (wired) {
5892 pte |= ARM_PTE_WIRED;
5893 }
5894
5895 #if DEVELOPMENT || DEBUG
5896 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5897 #else
5898 if ((prot & VM_PROT_EXECUTE))
5899 #endif
5900 {
5901 set_NX = false;
5902 } else {
5903 set_NX = true;
5904 }
5905
5906 if (prot == VM_PROT_EXECUTE) {
5907 set_XO = true;
5908 }
5909
5910 if (set_NX) {
5911 pte |= pt_attr_leaf_xn(pt_attr);
5912 } else {
5913 if (pmap == kernel_pmap) {
5914 pte |= ARM_PTE_NX;
5915 } else {
5916 pte |= pt_attr_leaf_x(pt_attr);
5917 }
5918 }
5919
5920 if (pmap == kernel_pmap) {
5921 #if __ARM_KERNEL_PROTECT__
5922 pte |= ARM_PTE_NG;
5923 #endif /* __ARM_KERNEL_PROTECT__ */
5924 if (prot & VM_PROT_WRITE) {
5925 pte |= ARM_PTE_AP(AP_RWNA);
5926 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5927 } else {
5928 pte |= ARM_PTE_AP(AP_RONA);
5929 *pp_attr_bits |= PP_ATTR_REFERENCED;
5930 }
5931 } else {
5932 if (pmap->type != PMAP_TYPE_NESTED) {
5933 pte |= ARM_PTE_NG;
5934 } else if ((pmap->nested_region_unnested_table_bitmap)
5935 && (va >= pmap->nested_region_addr)
5936 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5937 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5938
5939 if ((pmap->nested_region_unnested_table_bitmap)
5940 && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
5941 pte |= ARM_PTE_NG;
5942 }
5943 }
5944 if (prot & VM_PROT_WRITE) {
5945 assert(pmap->type != PMAP_TYPE_NESTED);
5946 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5947 if (fault_type & VM_PROT_WRITE) {
5948 pte |= pt_attr_leaf_rw(pt_attr);
5949 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5950 } else {
5951 pte |= pt_attr_leaf_ro(pt_attr);
5952 /*
5953 * Mark the page as MODFAULT so that a subsequent write
5954 * may be handled through arm_fast_fault().
5955 */
5956 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5957 pte_set_was_writeable(pte, true);
5958 }
5959 } else {
5960 pte |= pt_attr_leaf_rw(pt_attr);
5961 *pp_attr_bits |= PP_ATTR_REFERENCED;
5962 }
5963 } else {
5964 if (set_XO) {
5965 pte |= pt_attr_leaf_rona(pt_attr);
5966 } else {
5967 pte |= pt_attr_leaf_ro(pt_attr);
5968 }
5969 *pp_attr_bits |= PP_ATTR_REFERENCED;
5970 }
5971 }
5972
5973 pte |= ARM_PTE_AF;
5974 return pte;
5975 }
5976
/*
 * Enter a mapping of physical address pa at virtual address v in pmap.
 *
 * The work is structured as a transaction: the new PTE is constructed from
 * local state, committed with pmap_enter_pte(), and only after a successful
 * commit are the PV list, physical page attribute bits and ledgers updated.
 * If the commit fails, or a lock had to be dropped along the way, the
 * transaction is restarted from a fresh read of the PTE.
 *
 * Parameters:
 *   pmap        target pmap; must pass validate_pmap_mutable().
 *   v           virtual address; must be leaf-page aligned and lie within
 *               [pmap->min, pmap->max), otherwise this panics.
 *   pa          physical address; must be leaf-page aligned (panics
 *               otherwise) and is masked with ARM_PTE_PAGE_MASK.
 *   prot        requested protection.  Executable kernel mappings panic
 *               (except for the CTRR test page when that test is built in).
 *   fault_type  passed through to pmap_construct_pte().
 *   flags       if any VM_WIMG_MASK / VM_WIMG_USE_DEFAULT bits are set they
 *               select the cache attributes; otherwise
 *               pmap_cache_attributes() does.
 *   wired       passed through to pmap_construct_pte().
 *   options     PMAP_OPTIONS_* bits.  NOWAIT is mandatory under XNU_MONITOR;
 *               NOENTER, MAP_TPRO, INTERNAL and ALT_ACCT are interpreted
 *               here; the value is also passed to pmap_expand(), pv_alloc()
 *               and pmap_enter_pv().
 *
 * Returns:
 *   KERN_SUCCESS            the mapping was entered (or NOENTER was set and
 *                           the page table exists).
 *   KERN_ABORTED            pmap_lock_preempt() failed to take the lock.
 *   KERN_RESOURCE_SHORTAGE  pv_alloc() reported PV_ALLOC_FAIL.
 *   KERN_FAILURE            executable mapping requested for a pa for which
 *                           pa_valid() is false.
 *   other                   whatever error pmap_expand() returned.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t pte;         /* PTE value we are trying to install */
	pt_entry_t spte;        /* PTE value last observed at pte_p */
	pt_entry_t *pte_p;
	bool refcnt_updated;    /* true once we have taken our reference on the page table */
	bool wiredcnt_updated;  /* true once we have taken our temporary wire on the page table */
	bool ro_va = false;     /* v lies in the kernel read-only zone range */
	unsigned int wimg_bits;
	bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
		panic("%s: pmap %p v 0x%llx not page-aligned",
		    __func__, pmap, (unsigned long long)v);
	}

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		panic("%s: attempt to map out-of-bounds VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	/* The physical address must be leaf-page aligned as well. */
	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
		/* The CTRR test page is the one permitted exception: the panic below is guarded by this check. */
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		/* Kernel read-only zone VAs may only be mapped with exactly VM_PROT_READ. */
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 *	Expand pmap to include this pte.  Assume that
	 *	pmap is always expanded to include enough hardware
	 *	pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	/* With NOENTER the caller only wanted the page table to exist. */
	if (options & PMAP_OPTIONS_NOENTER) {
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before committing
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed.  We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			/*
			 * Take the reference for the new mapping up front, once.  It is
			 * dropped again at the end unless the mapping turns out to need it.
			 */
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_TYPE_FAULT;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			if (__improbable(pmap == kernel_pmap)) {
				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
				    __func__);
			}
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* pa_valid() pages have a PV head, so the PV list and ledgers are maintained for them. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			/*
			 * No PV entry is needed when the PV head is empty, or when it
			 * already points directly at this very PTE.
			 */
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Hand the entries to pv_free(); NOTE(review): presumably this refills the free list that pmap_enter_pv() draws from - confirm. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				/* Nothing has been committed yet; report the shortage to the caller. */
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			/* Caller-supplied cache attributes take precedence over the page's own. */
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			/* Try to commit; on failure spte has been refreshed, so just go around again. */
			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					/* Pick exactly one accounting class for the ledger updates below. */
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			/* Publish the attribute bits that pmap_construct_pte() asked for. */
			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* pa_valid() is false: no PV list or per-page accounting on this path. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			/* Start from the attributes recorded for this pa; caller flags override the WIMG field. */
			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed.  We must therefore prevent an attacker from downgrading a
			 * writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			/* The entry we replaced was a compressed marker: undo its accounting. */
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	/*
	 * Give back the page table reference taken at the top of the loop if it
	 * turned out not to be needed (existing mapping reused, or we bailed out).
	 */
	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	/* The temporary wire that kept the page table alive is always undone. */
	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	/* For a successful read-only zone mapping, also call pmap_phys_write_disable() on v. */
	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		pmap_phys_write_disable(v);
	}

	return kr;
}
6474
6475 kern_return_t
6476 pmap_enter_options_addr(
6477 pmap_t pmap,
6478 vm_map_address_t v,
6479 pmap_paddr_t pa,
6480 vm_prot_t prot,
6481 vm_prot_t fault_type,
6482 unsigned int flags,
6483 boolean_t wired,
6484 unsigned int options,
6485 __unused void *arg)
6486 {
6487 kern_return_t kr = KERN_FAILURE;
6488
6489
6490 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
6491 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);
6492
6493
6494 const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
6495 do {
6496 #if XNU_MONITOR
6497 kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
6498 #else
6499 kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
6500 #endif
6501
6502 if (kr == KERN_RESOURCE_SHORTAGE) {
6503 #if XNU_MONITOR
6504 pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
6505 #endif
6506 if (nowait_requested) {
6507 break;
6508 }
6509 }
6510 } while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
6511
6512 #if XNU_MONITOR
6513 pmap_ledger_check_balance(pmap);
6514 #endif
6515
6516 PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);
6517
6518 return kr;
6519 }
6520
6521 kern_return_t
6522 pmap_enter_options(
6523 pmap_t pmap,
6524 vm_map_address_t v,
6525 ppnum_t pn,
6526 vm_prot_t prot,
6527 vm_prot_t fault_type,
6528 unsigned int flags,
6529 boolean_t wired,
6530 unsigned int options,
6531 __unused void *arg)
6532 {
6533 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg);
6534 }
6535
6536 /*
6537 * Routine: pmap_change_wiring
6538 * Function: Change the wiring attribute for a map/virtual-address
6539 * pair.
6540 * In/out conditions:
6541 * The mapping must already exist in the pmap.
6542 */
6543 MARK_AS_PMAP_TEXT kern_return_t
6544 pmap_change_wiring_internal(
6545 pmap_t pmap,
6546 vm_map_address_t v,
6547 boolean_t wired)
6548 {
6549 pt_entry_t *pte_p;
6550 pmap_paddr_t pa;
6551
6552 validate_pmap_mutable(pmap);
6553
6554 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6555 return KERN_ABORTED;
6556 }
6557
6558 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6559
6560 pte_p = pmap_pte(pmap, v);
6561 if (pte_p == PT_ENTRY_NULL) {
6562 if (!wired) {
6563 /*
6564 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6565 * may have been freed by a remove operation.
6566 */
6567 goto pmap_change_wiring_return;
6568 } else {
6569 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6570 }
6571 }
6572 /*
6573 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6574 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6575 */
6576 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6577
6578 while (pa_valid(pa)) {
6579 pmap_paddr_t new_pa;
6580
6581 pvh_lock(pa_index(pa));
6582 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6583
6584 if (pa == new_pa) {
6585 break;
6586 }
6587
6588 pvh_unlock(pa_index(pa));
6589 pa = new_pa;
6590 }
6591
6592 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6593 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6594 if (!wired) {
6595 /* PTE cleared by prior remove/disconnect operation */
6596 goto pmap_change_wiring_cleanup;
6597 } else {
6598 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6599 __func__, pte_p, (uint64_t)*pte_p, pmap);
6600 }
6601 }
6602
6603 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6604 if (wired != pte_is_wired(*pte_p)) {
6605 pte_set_wired(pmap, pte_p, wired);
6606 if (pmap != kernel_pmap) {
6607 if (wired) {
6608 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6609 } else if (!wired) {
6610 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6611 }
6612 }
6613 }
6614
6615 pmap_change_wiring_cleanup:
6616 if (pa_valid(pa)) {
6617 pvh_unlock(pa_index(pa));
6618 }
6619
6620 pmap_change_wiring_return:
6621 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6622
6623 return KERN_SUCCESS;
6624 }
6625
6626 void
6627 pmap_change_wiring(
6628 pmap_t pmap,
6629 vm_map_address_t v,
6630 boolean_t wired)
6631 {
6632 /* This function is going to lock the pmap lock, so it'd better be preemptible. */
6633 pmap_verify_preemptible();
6634
6635 kern_return_t kr = KERN_FAILURE;
6636 #if XNU_MONITOR
6637 /* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
6638 do {
6639 kr = pmap_change_wiring_ppl(pmap, v, wired);
6640 } while (kr == KERN_ABORTED);
6641
6642 pmap_ledger_check_balance(pmap);
6643 #else
6644 /* Since we verified preemptibility, call the helper only once. */
6645 kr = pmap_change_wiring_internal(pmap, v, wired);
6646 #endif
6647
6648 if (kr != KERN_SUCCESS) {
6649 panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
6650 __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
6651 }
6652 }
6653
6654 MARK_AS_PMAP_TEXT pmap_paddr_t
6655 pmap_find_pa_internal(
6656 pmap_t pmap,
6657 addr64_t va)
6658 {
6659 pmap_paddr_t pa = 0;
6660
6661 validate_pmap(pmap);
6662
6663 if (pmap != kernel_pmap) {
6664 pmap_lock(pmap, PMAP_LOCK_SHARED);
6665 }
6666
6667 pa = pmap_vtophys(pmap, va);
6668
6669 if (pmap != kernel_pmap) {
6670 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6671 }
6672
6673 return pa;
6674 }
6675
6676 pmap_paddr_t
6677 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6678 {
6679 pmap_paddr_t pa = 0;
6680
6681 if (pmap == kernel_pmap) {
6682 pa = mmu_kvtop(va);
6683 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6684 /*
6685 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6686 * translation even if PAN would prevent kernel access through the translation.
6687 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6688 */
6689 pa = mmu_uvtop(va);
6690 }
6691 return pa;
6692 }
6693
6694 pmap_paddr_t
6695 pmap_find_pa(
6696 pmap_t pmap,
6697 addr64_t va)
6698 {
6699 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6700
6701 if (pa != 0) {
6702 return pa;
6703 }
6704
6705 if (not_in_kdp) {
6706 #if XNU_MONITOR
6707 return pmap_find_pa_ppl(pmap, va);
6708 #else
6709 return pmap_find_pa_internal(pmap, va);
6710 #endif
6711 } else {
6712 return pmap_vtophys(pmap, va);
6713 }
6714 }
6715
6716 ppnum_t
6717 pmap_find_phys_nofault(
6718 pmap_t pmap,
6719 addr64_t va)
6720 {
6721 ppnum_t ppn;
6722 ppn = atop(pmap_find_pa_nofault(pmap, va));
6723 return ppn;
6724 }
6725
6726 ppnum_t
6727 pmap_find_phys(
6728 pmap_t pmap,
6729 addr64_t va)
6730 {
6731 ppnum_t ppn;
6732 ppn = atop(pmap_find_pa(pmap, va));
6733 return ppn;
6734 }
6735
6736 /**
6737 * Translate a kernel virtual address into a physical address.
6738 *
6739 * @param va The kernel virtual address to translate. Does not work on user
6740 * virtual addresses.
6741 *
6742 * @return The physical address if the translation was successful, or zero if
6743 * no valid mappings were found for the given virtual address.
6744 */
6745 pmap_paddr_t
6746 kvtophys(vm_offset_t va)
6747 {
6748 /**
6749 * Attempt to do the translation first in hardware using the AT (address
6750 * translation) instruction. This will attempt to use the MMU to do the
6751 * translation for us.
6752 */
6753 pmap_paddr_t pa = mmu_kvtop(va);
6754
6755 if (pa) {
6756 return pa;
6757 }
6758
6759 /* If the MMU can't find the mapping, then manually walk the page tables. */
6760 return pmap_vtophys(kernel_pmap, va);
6761 }
6762
6763 /**
6764 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6765 * points to a non-kernel-managed physical page, then this call will panic().
6766 *
6767 * @note The output of this function is guaranteed to be a kernel-managed
6768 * physical page, which means it's safe to pass the output directly to
6769 * pa_index() to create a physical address index for various pmap data
6770 * structures.
6771 *
6772 * @param va The kernel virtual address to translate. Does not work on user
6773 * virtual addresses.
6774 *
6775 * @return The translated physical address for the given virtual address.
6776 */
6777 pmap_paddr_t
6778 kvtophys_nofail(vm_offset_t va)
6779 {
6780 pmap_paddr_t pa = kvtophys(va);
6781
6782 if (!pa_valid(pa)) {
6783 panic("%s: Invalid or non-kernel-managed physical page returned, "
6784 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6785 }
6786
6787 return pa;
6788 }
6789
6790 pmap_paddr_t
6791 pmap_vtophys(
6792 pmap_t pmap,
6793 addr64_t va)
6794 {
6795 if ((va < pmap->min) || (va >= pmap->max)) {
6796 return 0;
6797 }
6798
6799 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6800
6801 tt_entry_t * ttp = NULL;
6802 tt_entry_t * ttep = NULL;
6803 tt_entry_t tte = ARM_TTE_EMPTY;
6804 pmap_paddr_t pa = 0;
6805 unsigned int cur_level;
6806
6807 ttp = pmap->tte;
6808
6809 for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
6810 ttep = &ttp[ttn_index(pt_attr, va, cur_level)];
6811
6812 tte = *ttep;
6813
6814 const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
6815 const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
6816 const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
6817 const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;
6818
6819 if ((tte & valid_mask) != valid_mask) {
6820 return (pmap_paddr_t) 0;
6821 }
6822
6823 /* This detects both leaf entries and intermediate block mappings. */
6824 if ((tte & type_mask) == type_block) {
6825 pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
6826 break;
6827 }
6828
6829 ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
6830 }
6831
6832 return pa;
6833 }
6834
6835 /*
6836 * pmap_init_pte_page - Initialize a page table page.
6837 */
6838 MARK_AS_PMAP_TEXT void
6839 pmap_init_pte_page(
6840 pmap_t pmap,
6841 pt_entry_t *pte_p,
6842 vm_offset_t va,
6843 unsigned int ttlevel,
6844 boolean_t alloc_ptd)
6845 {
6846 pt_desc_t *ptdp = NULL;
6847 pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));
6848
6849 if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
6850 if (alloc_ptd) {
6851 /*
6852 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
6853 * on 4KB hardware, we may already have allocated a page table descriptor for a
6854 * bootstrap request, so we check for an existing PTD here.
6855 */
6856 ptdp = ptd_alloc(pmap);
6857 if (ptdp == NULL) {
6858 panic("%s: unable to allocate PTD", __func__);
6859 }
6860 pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
6861 /* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
6862 pvh_set_flags(pvh, 0);
6863 } else {
6864 panic("pmap_init_pte_page(): pte_p %p", pte_p);
6865 }
6866 } else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
6867 ptdp = pvh_ptd(pvh);
6868 } else {
6869 panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
6870 }
6871
6872 // below barrier ensures previous updates to the page are visible to PTW before
6873 // it is linked to the PTE of previous level
6874 __builtin_arm_dmb(DMB_ISHST);
6875 ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
6876 }
6877
6878 /*
6879 * Routine: pmap_expand
6880 *
6881 * Expands a pmap to be able to map the specified virtual address.
6882 *
6883 * Allocates new memory for the default (COARSE) translation table
6884 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6885 * also allocates space for the corresponding pv entries.
6886 *
6887 * Nothing should be locked.
6888 */
6889 MARK_AS_PMAP_TEXT static kern_return_t
6890 pmap_expand(
6891 pmap_t pmap,
6892 vm_map_address_t v,
6893 unsigned int options,
6894 unsigned int level)
6895 {
6896 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
6897
6898 if (__improbable((v < pmap->min) || (v >= pmap->max))) {
6899 return KERN_INVALID_ADDRESS;
6900 }
6901 pmap_paddr_t pa;
6902 unsigned int ttlevel = pt_attr_root_level(pt_attr);
6903 tt_entry_t *tte_p;
6904 tt_entry_t *tt_p;
6905
6906 pa = 0x0ULL;
6907 tt_p = (tt_entry_t *)NULL;
6908
6909 for (; ttlevel < level; ttlevel++) {
6910 if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
6911 return KERN_ABORTED;
6912 }
6913
6914 if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
6915 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6916 kern_return_t ret;
6917 while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
6918 if (options & PMAP_OPTIONS_NOWAIT) {
6919 /* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
6920 return ret;
6921 }
6922 #if XNU_MONITOR
6923 panic("%s: failed to allocate tt, "
6924 "pmap=%p, v=%p, options=0x%x, level=%u",
6925 __FUNCTION__,
6926 pmap, (void *)v, options, level);
6927 #else
6928 VM_PAGE_WAIT();
6929 #endif
6930 }
6931
6932 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6933 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6934 return KERN_ABORTED;
6935 }
6936
6937 if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
6938 pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
6939 pa = kvtophys_nofail((vm_offset_t)tt_p);
6940 tte_p = pmap_ttne(pmap, ttlevel, v);
6941 *tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
6942 PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
6943 VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
6944 pa = 0x0ULL;
6945 tt_p = (tt_entry_t *)NULL;
6946 }
6947 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6948 } else {
6949 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6950 }
6951
6952 if (tt_p != (tt_entry_t *)NULL) {
6953 pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
6954 tt_p = (tt_entry_t *)NULL;
6955 }
6956 }
6957
6958 return KERN_SUCCESS;
6959 }
6960
/*
 *	Routine:	pmap_gc
 *	Function:
 *		Pmap garbage collection.
 *		Called by the pageout daemon when pages are scarce.
 *
 *		This implementation is intentionally a no-op.
 */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do
	 * anything meaningful.  We can't just destroy any old pmap on the chance
	 * that it may be active on a CPU or may contain wired mappings.  However,
	 * with the relatively recent change to make pmap_page_reclaim() non-fatal
	 * in the event that it doesn't find an eligible page, it may make sense
	 * to call that function here.
	 */
}
6979
6980 /*
6981 * By default, don't attempt pmap GC more frequently
6982 * than once / 1 minutes.
6983 */
6984
6985 void
6986 compute_pmap_gc_throttle(
6987 void *arg __unused)
6988 {
6989 }
6990
6991 /*
6992 * pmap_attribute_cache_sync(vm_offset_t pa)
6993 *
6994 * Invalidates all of the instruction cache on a physical page and
6995 * pushes any dirty data from the data cache for the same physical page
6996 */
6997
6998 kern_return_t
6999 pmap_attribute_cache_sync(
7000 ppnum_t pp,
7001 vm_size_t size,
7002 __unused vm_machine_attribute_t attribute,
7003 __unused vm_machine_attribute_val_t * value)
7004 {
7005 if (size > PAGE_SIZE) {
7006 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7007 } else {
7008 cache_sync_page(pp);
7009 }
7010
7011 return KERN_SUCCESS;
7012 }
7013
7014 /*
7015 * pmap_sync_page_data_phys(ppnum_t pp)
7016 *
7017 * Invalidates all of the instruction cache on a physical page and
7018 * pushes any dirty data from the data cache for the same physical page
7019 */
7020 void
7021 pmap_sync_page_data_phys(
7022 ppnum_t pp)
7023 {
7024 cache_sync_page(pp);
7025 }
7026
7027 /*
7028 * pmap_sync_page_attributes_phys(ppnum_t pp)
7029 *
7030 * Write back and invalidate all cachelines on a physical page.
7031 */
7032 void
7033 pmap_sync_page_attributes_phys(
7034 ppnum_t pp)
7035 {
7036 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7037 }
7038
7039 #if CONFIG_COREDUMP
7040 /* temporary workaround */
7041 boolean_t
7042 coredumpok(
7043 vm_map_t map,
7044 mach_vm_offset_t va)
7045 {
7046 pt_entry_t *pte_p;
7047 pt_entry_t spte;
7048
7049 pte_p = pmap_pte(map->pmap, va);
7050 if (0 == pte_p) {
7051 return FALSE;
7052 }
7053 if (vm_map_entry_has_device_pager(map, va)) {
7054 return FALSE;
7055 }
7056 spte = *pte_p;
7057 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7058 }
7059 #endif
7060
7061 void
7062 fillPage(
7063 ppnum_t pn,
7064 unsigned int fill)
7065 {
7066 unsigned int *addr;
7067 int count;
7068
7069 addr = (unsigned int *) phystokv(ptoa(pn));
7070 count = PAGE_SIZE / sizeof(unsigned int);
7071 while (count--) {
7072 *addr++ = fill;
7073 }
7074 }
7075
7076 extern void mapping_set_mod(ppnum_t pn);
7077
7078 void
7079 mapping_set_mod(
7080 ppnum_t pn)
7081 {
7082 pmap_set_modify(pn);
7083 }
7084
7085 extern void mapping_set_ref(ppnum_t pn);
7086
7087 void
7088 mapping_set_ref(
7089 ppnum_t pn)
7090 {
7091 pmap_set_reference(pn);
7092 }
7093
7094 /*
7095 * Clear specified attribute bits.
7096 *
7097 * Try to force an arm_fast_fault() for all mappings of
7098 * the page - to force attributes to be set again at fault time.
7099 * If the forcing succeeds, clear the cached bits at the head.
7100 * Otherwise, something must have been wired, so leave the cached
7101 * attributes alone.
7102 */
7103 MARK_AS_PMAP_TEXT static void
7104 phys_attribute_clear_with_flush_range(
7105 ppnum_t pn,
7106 unsigned int bits,
7107 int options,
7108 void *arg,
7109 pmap_tlb_flush_range_t *flush_range)
7110 {
7111 pmap_paddr_t pa = ptoa(pn);
7112 vm_prot_t allow_mode = VM_PROT_ALL;
7113
7114 #if XNU_MONITOR
7115 if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
7116 panic("%s: illegal request, "
7117 "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
7118 __FUNCTION__,
7119 pn, bits, options, arg, flush_range);
7120 }
7121 #endif
7122 if ((arg != NULL) || (flush_range != NULL)) {
7123 options = options & ~PMAP_OPTIONS_NOFLUSH;
7124 }
7125
7126 if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
7127 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7128 "invalid options",
7129 pn, bits, options, arg, flush_range);
7130 }
7131
7132 if (__improbable((bits & PP_ATTR_MODIFIED) &&
7133 (options & PMAP_OPTIONS_NOFLUSH))) {
7134 panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
7135 "should not clear 'modified' without flushing TLBs",
7136 pn, bits, options, arg, flush_range);
7137 }
7138
7139 assert(pn != vm_page_fictitious_addr);
7140
7141 if (options & PMAP_OPTIONS_CLEAR_WRITE) {
7142 assert(bits == PP_ATTR_MODIFIED);
7143
7144 pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
7145 /*
7146 * We short circuit this case; it should not need to
7147 * invoke arm_force_fast_fault, so just clear the modified bit.
7148 * pmap_page_protect has taken care of resetting
7149 * the state so that we'll see the next write as a fault to
7150 * the VM (i.e. we don't want a fast fault).
7151 */
7152 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7153 return;
7154 }
7155 if (bits & PP_ATTR_REFERENCED) {
7156 allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
7157 }
7158 if (bits & PP_ATTR_MODIFIED) {
7159 allow_mode &= ~VM_PROT_WRITE;
7160 }
7161
7162 if (bits == PP_ATTR_NOENCRYPT) {
7163 /*
7164 * We short circuit this case; it should not need to
7165 * invoke arm_force_fast_fault, so just clear and
7166 * return. On ARM, this bit is just a debugging aid.
7167 */
7168 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7169 return;
7170 }
7171
7172 if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
7173 ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
7174 }
7175 }
7176
7177 MARK_AS_PMAP_TEXT void
7178 phys_attribute_clear_internal(
7179 ppnum_t pn,
7180 unsigned int bits,
7181 int options,
7182 void *arg)
7183 {
7184 phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
7185 }
7186
7187 #if __ARM_RANGE_TLBI__
7188 MARK_AS_PMAP_TEXT static vm_map_address_t
7189 phys_attribute_clear_twig_internal(
7190 pmap_t pmap,
7191 vm_map_address_t start,
7192 vm_map_address_t end,
7193 unsigned int bits,
7194 unsigned int options,
7195 pmap_tlb_flush_range_t *flush_range)
7196 {
7197 pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
7198 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7199 assert(end >= start);
7200 assert((end - start) <= pt_attr_twig_size(pt_attr));
7201 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
7202 vm_map_address_t va = start;
7203 pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
7204 tt_entry_t *tte_p;
7205 tte_p = pmap_tte(pmap, start);
7206 unsigned int npages = 0;
7207
7208 if (tte_p == (tt_entry_t *) NULL) {
7209 return end;
7210 }
7211
7212 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
7213 pte_p = (pt_entry_t *) ttetokv(*tte_p);
7214
7215 start_pte_p = &pte_p[pte_index(pt_attr, start)];
7216 end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
7217 assert(end_pte_p >= start_pte_p);
7218 for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
7219 if (__improbable(npages++ && pmap_pending_preemption())) {
7220 return va;
7221 }
7222 pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
7223 if (pa_valid(pa)) {
7224 ppnum_t pn = (ppnum_t) atop(pa);
7225 phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
7226 }
7227 }
7228 }
7229 return end;
7230 }
7231
7232 MARK_AS_PMAP_TEXT vm_map_address_t
7233 phys_attribute_clear_range_internal(
7234 pmap_t pmap,
7235 vm_map_address_t start,
7236 vm_map_address_t end,
7237 unsigned int bits,
7238 unsigned int options)
7239 {
7240 if (__improbable(end < start)) {
7241 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
7242 }
7243 validate_pmap_mutable(pmap);
7244
7245 vm_map_address_t va = start;
7246 pmap_tlb_flush_range_t flush_range = {
7247 .ptfr_pmap = pmap,
7248 .ptfr_start = start,
7249 .ptfr_end = end,
7250 .ptfr_flush_needed = false
7251 };
7252
7253 if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
7254 return va;
7255 }
7256
7257 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
7258
7259 while (va < end) {
7260 vm_map_address_t curr_end;
7261
7262 curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
7263 if (curr_end > end) {
7264 curr_end = end;
7265 }
7266
7267 va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
7268 if ((va < curr_end) || pmap_pending_preemption()) {
7269 break;
7270 }
7271 }
7272 pmap_unlock(pmap, PMAP_LOCK_SHARED);
7273 if (flush_range.ptfr_flush_needed) {
7274 pmap_get_pt_ops(pmap)->flush_tlb_region_async(
7275 flush_range.ptfr_start,
7276 flush_range.ptfr_end - flush_range.ptfr_start,
7277 flush_range.ptfr_pmap,
7278 true,
7279 false);
7280 sync_tlb_flush();
7281 }
7282 return va;
7283 }
7284
7285 static void
7286 phys_attribute_clear_range(
7287 pmap_t pmap,
7288 vm_map_address_t start,
7289 vm_map_address_t end,
7290 unsigned int bits,
7291 unsigned int options)
7292 {
7293 /*
7294 * We allow single-page requests to execute non-preemptibly,
7295 * as it doesn't make sense to sample AST_URGENT for a single-page
7296 * operation, and there are a couple of special use cases that
7297 * require a non-preemptible single-page operation.
7298 */
7299 if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
7300 pmap_verify_preemptible();
7301 }
7302
7303 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);
7304
7305 while (start < end) {
7306 #if XNU_MONITOR
7307 start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
7308 #else
7309 start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
7310 #endif
7311 }
7312
7313 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
7314 }
7315 #endif /* __ARM_RANGE_TLBI__ */
7316
7317 static void
7318 phys_attribute_clear(
7319 ppnum_t pn,
7320 unsigned int bits,
7321 int options,
7322 void *arg)
7323 {
7324 /*
7325 * Do we really want this tracepoint? It will be extremely chatty.
7326 * Also, should we have a corresponding trace point for the set path?
7327 */
7328 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);
7329
7330 #if XNU_MONITOR
7331 phys_attribute_clear_ppl(pn, bits, options, arg);
7332 #else
7333 phys_attribute_clear_internal(pn, bits, options, arg);
7334 #endif
7335
7336 PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
7337 }
7338
7339 /*
7340 * Set specified attribute bits.
7341 *
7342 * Set cached value in the pv head because we have
7343 * no per-mapping hardware support for referenced and
7344 * modify bits.
7345 */
7346 MARK_AS_PMAP_TEXT void
7347 phys_attribute_set_internal(
7348 ppnum_t pn,
7349 unsigned int bits)
7350 {
7351 pmap_paddr_t pa = ptoa(pn);
7352 assert(pn != vm_page_fictitious_addr);
7353
7354 #if XNU_MONITOR
7355 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7356 panic("%s: illegal request, "
7357 "pn=%u, bits=%#x",
7358 __FUNCTION__,
7359 pn, bits);
7360 }
7361 #endif
7362
7363 ppattr_pa_set_bits(pa, (uint16_t)bits);
7364
7365 return;
7366 }
7367
7368 static void
7369 phys_attribute_set(
7370 ppnum_t pn,
7371 unsigned int bits)
7372 {
7373 #if XNU_MONITOR
7374 phys_attribute_set_ppl(pn, bits);
7375 #else
7376 phys_attribute_set_internal(pn, bits);
7377 #endif
7378 }
7379
7380
7381 /*
7382 * Check specified attribute bits.
7383 *
7384 * use the software cached bits (since no hw support).
7385 */
7386 static boolean_t
7387 phys_attribute_test(
7388 ppnum_t pn,
7389 unsigned int bits)
7390 {
7391 pmap_paddr_t pa = ptoa(pn);
7392 assert(pn != vm_page_fictitious_addr);
7393 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7394 }
7395
7396
7397 /*
7398 * Set the modify/reference bits on the specified physical page.
7399 */
7400 void
7401 pmap_set_modify(ppnum_t pn)
7402 {
7403 phys_attribute_set(pn, PP_ATTR_MODIFIED);
7404 }
7405
7406
7407 /*
7408 * Clear the modify bits on the specified physical page.
7409 */
7410 void
7411 pmap_clear_modify(
7412 ppnum_t pn)
7413 {
7414 phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
7415 }
7416
7417
7418 /*
7419 * pmap_is_modified:
7420 *
7421 * Return whether or not the specified physical page is modified
7422 * by any physical maps.
7423 */
7424 boolean_t
7425 pmap_is_modified(
7426 ppnum_t pn)
7427 {
7428 return phys_attribute_test(pn, PP_ATTR_MODIFIED);
7429 }
7430
7431
7432 /*
7433 * Set the reference bit on the specified physical page.
7434 */
7435 static void
7436 pmap_set_reference(
7437 ppnum_t pn)
7438 {
7439 phys_attribute_set(pn, PP_ATTR_REFERENCED);
7440 }
7441
7442 /*
7443 * Clear the reference bits on the specified physical page.
7444 */
7445 void
7446 pmap_clear_reference(
7447 ppnum_t pn)
7448 {
7449 phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
7450 }
7451
7452
7453 /*
7454 * pmap_is_referenced:
7455 *
7456 * Return whether or not the specified physical page is referenced
7457 * by any physical maps.
7458 */
7459 boolean_t
7460 pmap_is_referenced(
7461 ppnum_t pn)
7462 {
7463 return phys_attribute_test(pn, PP_ATTR_REFERENCED);
7464 }
7465
7466 /*
7467 * pmap_get_refmod(phys)
7468 * returns the referenced and modified bits of the specified
7469 * physical page.
7470 */
7471 unsigned int
7472 pmap_get_refmod(
7473 ppnum_t pn)
7474 {
7475 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7476 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7477 }
7478
7479 static inline unsigned int
7480 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7481 {
7482 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7483 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7484 }
7485
7486 /*
7487 * pmap_clear_refmod(phys, mask)
7488 * clears the referenced and modified bits as specified by the mask
7489 * of the specified physical page.
7490 */
7491 void
7492 pmap_clear_refmod_options(
7493 ppnum_t pn,
7494 unsigned int mask,
7495 unsigned int options,
7496 void *arg)
7497 {
7498 unsigned int bits;
7499
7500 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7501 phys_attribute_clear(pn, bits, options, arg);
7502 }
7503
7504 /*
7505 * Perform pmap_clear_refmod_options on a virtual address range.
7506 * The operation will be performed in bulk & tlb flushes will be coalesced
7507 * if possible.
7508 *
7509 * Returns true if the operation is supported on this platform.
7510 * If this function returns false, the operation is not supported and
7511 * nothing has been modified in the pmap.
7512 */
7513 bool
7514 pmap_clear_refmod_range_options(
7515 pmap_t pmap __unused,
7516 vm_map_address_t start __unused,
7517 vm_map_address_t end __unused,
7518 unsigned int mask __unused,
7519 unsigned int options __unused)
7520 {
7521 #if __ARM_RANGE_TLBI__
7522 unsigned int bits;
7523 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7524 phys_attribute_clear_range(pmap, start, end, bits, options);
7525 return true;
7526 #else /* __ARM_RANGE_TLBI__ */
7527 #pragma unused(pmap, start, end, mask, options)
7528 /*
7529 * This operation allows the VM to bulk modify refmod bits on a virtually
7530 * contiguous range of addresses. This is large performance improvement on
7531 * platforms that support ranged tlbi instructions. But on older platforms,
7532 * we can only flush per-page or the entire asid. So we currently
7533 * only support this operation on platforms that support ranged tlbi.
7534 * instructions. On other platforms, we require that
7535 * the VM modify the bits on a per-page basis.
7536 */
7537 return false;
7538 #endif /* __ARM_RANGE_TLBI__ */
7539 }
7540
7541 void
7542 pmap_clear_refmod(
7543 ppnum_t pn,
7544 unsigned int mask)
7545 {
7546 pmap_clear_refmod_options(pn, mask, 0, NULL);
7547 }
7548
7549 unsigned int
7550 pmap_disconnect_options(
7551 ppnum_t pn,
7552 unsigned int options,
7553 void *arg)
7554 {
7555 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7556 /*
7557 * On ARM, the "modified" bit is managed by software, so
7558 * we know up-front if the physical page is "modified",
7559 * without having to scan all the PTEs pointing to it.
7560 * The caller should have made the VM page "busy" so noone
7561 * should be able to establish any new mapping and "modify"
7562 * the page behind us.
7563 */
7564 if (pmap_is_modified(pn)) {
7565 /*
7566 * The page has been modified and will be sent to
7567 * the VM compressor.
7568 */
7569 options |= PMAP_OPTIONS_COMPRESSOR;
7570 } else {
7571 /*
7572 * The page hasn't been modified and will be freed
7573 * instead of compressed.
7574 */
7575 }
7576 }
7577
7578 /* disconnect the page */
7579 pmap_page_protect_options(pn, 0, options, arg);
7580
7581 /* return ref/chg status */
7582 return pmap_get_refmod(pn);
7583 }
7584
7585 /*
7586 * Routine:
7587 * pmap_disconnect
7588 *
7589 * Function:
7590 * Disconnect all mappings for this page and return reference and change status
7591 * in generic format.
7592 *
7593 */
7594 unsigned int
7595 pmap_disconnect(
7596 ppnum_t pn)
7597 {
7598 pmap_page_protect(pn, 0); /* disconnect the page */
7599 return pmap_get_refmod(pn); /* return ref/chg status */
7600 }
7601
7602 boolean_t
7603 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7604 {
7605 if (ptoa(first) >= vm_last_phys) {
7606 return FALSE;
7607 }
7608 if (ptoa(last) < vm_first_phys) {
7609 return FALSE;
7610 }
7611
7612 return TRUE;
7613 }
7614
7615 /*
7616 * The state maintained by the noencrypt functions is used as a
7617 * debugging aid on ARM. This incurs some overhead on the part
7618 * of the caller. A special case check in phys_attribute_clear
7619 * (the most expensive path) currently minimizes this overhead,
7620 * but stubbing these functions out on RELEASE kernels yields
7621 * further wins.
7622 */
7623 boolean_t
7624 pmap_is_noencrypt(
7625 ppnum_t pn)
7626 {
7627 #if DEVELOPMENT || DEBUG
7628 boolean_t result = FALSE;
7629
7630 if (!pa_valid(ptoa(pn))) {
7631 return FALSE;
7632 }
7633
7634 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7635
7636 return result;
7637 #else
7638 #pragma unused(pn)
7639 return FALSE;
7640 #endif
7641 }
7642
7643 void
7644 pmap_set_noencrypt(
7645 ppnum_t pn)
7646 {
7647 #if DEVELOPMENT || DEBUG
7648 if (!pa_valid(ptoa(pn))) {
7649 return;
7650 }
7651
7652 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7653 #else
7654 #pragma unused(pn)
7655 #endif
7656 }
7657
7658 void
7659 pmap_clear_noencrypt(
7660 ppnum_t pn)
7661 {
7662 #if DEVELOPMENT || DEBUG
7663 if (!pa_valid(ptoa(pn))) {
7664 return;
7665 }
7666
7667 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7668 #else
7669 #pragma unused(pn)
7670 #endif
7671 }
7672
#if XNU_MONITOR
/*
 * Tests PP_ATTR_MONITOR for page pn.  The page is asserted to pass
 * pa_valid(); there is no runtime fallback when assertions are disabled.
 */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));

	const boolean_t has_monitor_attr = phys_attribute_test(pn, PP_ATTR_MONITOR);

	return has_monitor_attr;
}
#endif /* XNU_MONITOR */
7681
7682 void
7683 pmap_lock_phys_page(ppnum_t pn)
7684 {
7685 #if !XNU_MONITOR
7686 unsigned int pai;
7687 pmap_paddr_t phys = ptoa(pn);
7688
7689 if (pa_valid(phys)) {
7690 pai = pa_index(phys);
7691 pvh_lock(pai);
7692 } else
7693 #else
7694 (void)pn;
7695 #endif
7696 { simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
7697 }
7698
7699
7700 void
7701 pmap_unlock_phys_page(ppnum_t pn)
7702 {
7703 #if !XNU_MONITOR
7704 unsigned int pai;
7705 pmap_paddr_t phys = ptoa(pn);
7706
7707 if (pa_valid(phys)) {
7708 pai = pa_index(phys);
7709 pvh_unlock(pai);
7710 } else
7711 #else
7712 (void)pn;
7713 #endif
7714 { simple_unlock(&phys_backup_lock);}
7715 }
7716
7717 MARK_AS_PMAP_TEXT static void
7718 pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
7719 {
7720 if (pmap != kernel_pmap) {
7721 cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
7722 cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
7723 NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
7724 cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
7725 cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
7726 #if __ARM_MIXED_PAGE_SIZE__
7727 cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
7728 #endif
7729 }
7730
7731
7732 #if __ARM_MIXED_PAGE_SIZE__
7733 if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
7734 set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
7735 }
7736 #endif /* __ARM_MIXED_PAGE_SIZE__ */
7737
7738
7739 if (pmap != kernel_pmap) {
7740 set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
7741 } else if (!pmap_user_ttb_is_clear()) {
7742 pmap_clear_user_ttb_internal();
7743 }
7744 }
7745
7746 MARK_AS_PMAP_TEXT void
7747 pmap_clear_user_ttb_internal(void)
7748 {
7749 set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
7750 }
7751
7752 void
7753 pmap_clear_user_ttb(void)
7754 {
7755 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
7756 #if XNU_MONITOR
7757 pmap_clear_user_ttb_ppl();
7758 #else
7759 pmap_clear_user_ttb_internal();
7760 #endif
7761 PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
7762 }
7763
7764
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 *
 * The fast-fault routines set this bit on a PTE in their first pass and
 * clear it again in their second pass; when the macro is not defined they
 * skip the marking (see the #ifdef ARM_PTE_FF_MARKER sections below).
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* Build-time checks that the marker is not the same value as the WRITEABLE or WIRED bits. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7778
7779
/*
 * Force the mappings of a managed page to fault on accesses that are not
 * covered by allow_mode, so that reference/modify state can be re-gathered
 * by the fast-fault path.
 *
 * For each CPU mapping found through the page's PV head:
 *   - if VM_PROT_READ is not in allow_mode, ARM_PTE_AF is cleared;
 *   - if VM_PROT_WRITE is not in allow_mode and the PTE is currently
 *     read-write, it is downgraded to read-only and flagged "was writeable".
 *
 * Parameters:
 *   ppnum        physical page number; must not be vm_page_fictitious_addr.
 *   allow_mode   access modes that should keep working without a fault.
 *   options      PMAP_OPTIONS_* bits examined here:
 *                  FF_LOCKED      - the PVH lock is neither taken nor dropped
 *                                   by this function (presumably the caller
 *                                   already holds it - confirm with callers);
 *                  FF_WIRED       - wired mappings are processed too; without
 *                                   it a wired mapping is skipped and the
 *                                   result becomes FALSE;
 *                  NOFLUSH        - PTE changes do not request TLB flushing;
 *                  CLEAR_REUSABLE /
 *                  SET_REUSABLE   - adjust ledgers and the page's global
 *                                   "reusable" attribute.
 *   flush_range  optional.  Mappings that belong to flush_range->ptfr_pmap
 *                and fall inside [ptfr_start, ptfr_end) are not invalidated
 *                individually here; ptfr_flush_needed is set instead so the
 *                caller can flush.
 *
 * Returns FALSE if the page is not managed or if a wired mapping was
 * skipped; TRUE otherwise.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t         ppnum,
	vm_prot_t       allow_mode,
	int             options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t    phys = ptoa(ppnum);
	pv_entry_t     *pve_p;
	pt_entry_t     *pte_p;
	unsigned int    pai;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;
	boolean_t       result;
	pv_entry_t    **pv_h;
	bool            is_reusable;
	bool            ref_fault;
	bool            mod_fault;
	bool            clear_write_fault = false;
	/* Never set to true anywhere in this function; see its single use near the end. */
	bool            ref_aliases_mod = false;
	/* The PVH lock is taken here unless the caller passed PMAP_OPTIONS_FF_LOCKED. */
	bool            mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/*
	 * The PV head either holds a single PTE pointer, a list of PV entries,
	 * or nothing; any other type is treated as corruption.
	 */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2.  tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates.  In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	/* Remember the starting point so that pass 2 can walk the same mappings again. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t       spte;
		pt_entry_t       tmplate;

		/* When walking a PVE list, fetch the next PTE slot; empty slots are skipped. */
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU entries are left untouched. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/*
		 * Wired mappings are only processed when PMAP_OPTIONS_FF_WIRED is set.
		 * Note that the ledger updates above have already been applied by the
		 * time a wired mapping is skipped.
		 */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate =  tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		/*
		 * Because of the "result &&" test, once an earlier mapping has been
		 * skipped as wired (result == FALSE) no later PTE in this walk is
		 * rewritten, even though ref_fault/mod_fault may still have been set.
		 */
		if (result && (tmplate != spte)) {
			/* A change confined to the software WRITEABLE bit does not require TLB maintenance. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/* Mappings outside the caller's flush range must be invalidated here, in pass 2. */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		/* Advance: a lone PTE ends the loop; a PVE moves to its next slot, then to the next PVE. */
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need an invalidation; clear the marker as we go. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Every PTE marked in pass 1 must have been found and unmarked in pass 2. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	/* Record which kinds of fast fault are now pending for this page. */
	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8093
8094 MARK_AS_PMAP_TEXT boolean_t
8095 arm_force_fast_fault_internal(
8096 ppnum_t ppnum,
8097 vm_prot_t allow_mode,
8098 int options)
8099 {
8100 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8101 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8102 }
8103 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8104 }
8105
8106 /*
8107 * Routine: arm_force_fast_fault
8108 *
8109 * Function:
8110 * Force all mappings for this page to fault according
8111 * to the access modes allowed, so we can gather ref/modify
8112 * bits again.
8113 */
8114
8115 boolean_t
8116 arm_force_fast_fault(
8117 ppnum_t ppnum,
8118 vm_prot_t allow_mode,
8119 int options,
8120 __unused void *arg)
8121 {
8122 pmap_paddr_t phys = ptoa(ppnum);
8123
8124 assert(ppnum != vm_page_fictitious_addr);
8125
8126 if (!pa_valid(phys)) {
8127 return FALSE; /* Not a managed page. */
8128 }
8129
8130 #if XNU_MONITOR
8131 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8132 #else
8133 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8134 #endif
8135 }
8136
/*
 *	Routine:	arm_clear_fast_fault
 *
 *	Function:
 *		Clear pending force fault for all mappings for this page based on
 *		the observed fault type, update ref/modify bits.
 *
 *	Parameters:
 *		ppnum       physical page number; must not be
 *		            vm_page_fictitious_addr.
 *		fault_type  access type that was observed.  With VM_PROT_WRITE, a
 *		            PTE flagged "was writeable" regains read-write
 *		            permission and ARM_PTE_AF, and the page is marked
 *		            referenced and modified.  Otherwise, with
 *		            VM_PROT_READ, a PTE lacking ARM_PTE_AF gets it set and
 *		            the page is marked referenced.
 *		pte_p       if not PT_ENTRY_NULL, only this PTE is processed;
 *		            otherwise the page's PV head is consulted and every
 *		            mapping is visited, up to PMAP_MAX_PV_LIST_CHUNK_SIZE
 *		            PV entries.
 *
 *	The PVH lock for the page must already be held (asserted).
 *
 *	Returns TRUE if at least one PTE was rewritten, FALSE otherwise
 *	(including when the page is not managed).
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t    pa = ptoa(ppnum);
	pv_entry_t     *pve_p;
	unsigned int    pai;
	boolean_t       result;
	bool            tlb_flush_needed = false;
	pv_entry_t    **pv_h;
	unsigned int    npve = 0;
	unsigned int    pass1_updated = 0;
	unsigned int    pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* With a caller-supplied PTE, pve_p stays NULL and the loops below run exactly once. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	/* Remember the starting point so that pass 2 can walk the same mappings again. */
	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU entries are left untouched. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			{
				/* Restore the read-write permission appropriate for the owning pmap. */
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			/* A change confined to the software WRITEABLE bit does not require TLB maintenance. */
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound the walk: stop after PMAP_MAX_PV_LIST_CHUNK_SIZE PV entries. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		/* Only PTEs marked in pass 1 need an invalidation; clear the marker as we go. */
		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Same bound as pass 1, so both passes cover the same PV entries. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Every PTE marked in pass 1 must have been found and unmarked in pass 2. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8345
/*
 * Determine if the fault was induced by software tracking of
 * modify/reference bits.  If so, re-enable the mapping (and set
 * the appropriate bits).
 *
 * Parameters:
 *   pmap        address space in which the fault occurred.
 *   va          faulting virtual address; it is rounded down to the pmap's
 *               page size before the PTE lookup.
 *   fault_type  access type that faulted (VM_PROT_* bits).
 *   was_af_fault, from_user  unused here.
 *
 * Returns KERN_SUCCESS if the fault was induced and was
 * successfully handled.
 *
 * Returns KERN_FAILURE if the fault was not induced and
 * the function was unable to deal with it.
 *
 * Returns KERN_PROTECTION_FAILURE if the pmap layer explicitly
 * disallows this type of access.
 *
 * Returns KERN_ABORTED if the pmap lock is taken and a
 * preemption is pending.
 *
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t   result = KERN_FAILURE;
	pt_entry_t     *ptep;
	pt_entry_t      spte = ARM_PTE_TYPE_FAULT;
	unsigned int    pai;
	pmap_paddr_t    pa;
	validate_pmap_mutable(pmap);

	/* The shared pmap lock is held for the rest of the function; failure to get it is reported as KERN_ABORTED. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		/*
		 * Sample the PTE, take the PVH lock for the page it maps, then
		 * re-read the PTE; retry until the value is stable under the lock.
		 * The loop exits with the PVH lock for pai held.
		 */
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				/* Unmanaged address carrying the PP_ATTR_MONITOR attribute: access is refused. */
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	/* result is still KERN_FAILURE here, so only the pending-fault attribute tests decide this branch. */
	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above.  Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 *
		 * NOTE(review): no is_pte_xprr_protected() case is visible earlier in
		 * this function; the reference above may be stale - confirm.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	/* Release in the reverse order of acquisition: PVH lock first, then the pmap lock. */
	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8503
8504 kern_return_t
8505 arm_fast_fault(
8506 pmap_t pmap,
8507 vm_map_address_t va,
8508 vm_prot_t fault_type,
8509 bool was_af_fault,
8510 __unused bool from_user)
8511 {
8512 kern_return_t result = KERN_FAILURE;
8513
8514 if (va < pmap->min || va >= pmap->max) {
8515 return result;
8516 }
8517
8518 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
8519 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
8520 from_user);
8521
8522 do {
8523 #if XNU_MONITOR
8524 result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
8525 #else
8526 result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
8527 #endif
8528 } while (result == KERN_ABORTED);
8529
8530 PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);
8531
8532 return result;
8533 }
8534
8535 void
8536 pmap_copy_page(
8537 ppnum_t psrc,
8538 ppnum_t pdst)
8539 {
8540 bcopy_phys((addr64_t) (ptoa(psrc)),
8541 (addr64_t) (ptoa(pdst)),
8542 PAGE_SIZE);
8543 }
8544
8545
8546 /*
8547 * pmap_copy_page copies the specified (machine independent) pages.
8548 */
8549 void
8550 pmap_copy_part_page(
8551 ppnum_t psrc,
8552 vm_offset_t src_offset,
8553 ppnum_t pdst,
8554 vm_offset_t dst_offset,
8555 vm_size_t len)
8556 {
8557 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8558 (addr64_t) (ptoa(pdst) + dst_offset),
8559 len);
8560 }
8561
8562
8563 /*
8564 * pmap_zero_page zeros the specified (machine independent) page.
8565 */
8566 void
8567 pmap_zero_page(
8568 ppnum_t pn)
8569 {
8570 assert(pn != vm_page_fictitious_addr);
8571 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8572 }
8573
8574 /*
8575 * pmap_zero_part_page
8576 * zeros the specified (machine independent) part of a page.
8577 */
8578 void
8579 pmap_zero_part_page(
8580 ppnum_t pn,
8581 vm_offset_t offset,
8582 vm_size_t len)
8583 {
8584 assert(pn != vm_page_fictitious_addr);
8585 assert(offset + len <= PAGE_SIZE);
8586 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8587 }
8588
8589 void
8590 pmap_map_globals(
8591 void)
8592 {
8593 pt_entry_t *ptep, pte;
8594
8595 ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
8596 assert(ptep != PT_ENTRY_NULL);
8597 assert(*ptep == ARM_PTE_EMPTY);
8598
8599 pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
8600 #if __ARM_KERNEL_PROTECT__
8601 pte |= ARM_PTE_NG;
8602 #endif /* __ARM_KERNEL_PROTECT__ */
8603 pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
8604 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
8605 *ptep = pte;
8606 FLUSH_PTE();
8607 PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);
8608
8609 #if KASAN
8610 kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
8611 #endif
8612 }
8613
8614 vm_offset_t
8615 pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
8616 {
8617 if (__improbable(index >= CPUWINDOWS_MAX)) {
8618 panic("%s: invalid index %u", __func__, index);
8619 }
8620 return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
8621 }
8622
8623 MARK_AS_PMAP_TEXT unsigned int
8624 pmap_map_cpu_windows_copy_internal(
8625 ppnum_t pn,
8626 vm_prot_t prot,
8627 unsigned int wimg_bits)
8628 {
8629 pt_entry_t *ptep = NULL, pte;
8630 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8631 unsigned int cpu_num;
8632 unsigned int i;
8633 vm_offset_t cpu_copywindow_vaddr = 0;
8634 bool need_strong_sync = false;
8635
8636 #if XNU_MONITOR
8637 unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
8638 need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
8639 #endif
8640
8641 #if XNU_MONITOR
8642 #ifdef __ARM_COHERENT_IO__
8643 if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
8644 panic("%s: attempted to map a managed page, "
8645 "pn=%u, prot=0x%x, wimg_bits=0x%x",
8646 __FUNCTION__,
8647 pn, prot, wimg_bits);
8648 }
8649 if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
8650 panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
8651 }
8652
8653 #else /* __ARM_COHERENT_IO__ */
8654 #error CPU copy windows are not properly supported with both the PPL and incoherent IO
8655 #endif /* __ARM_COHERENT_IO__ */
8656 #endif /* XNU_MONITOR */
8657 cpu_num = pmap_cpu_data->cpu_number;
8658
8659 for (i = 0; i < CPUWINDOWS_MAX; i++) {
8660 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
8661 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8662 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
8663 if (*ptep == ARM_PTE_TYPE_FAULT) {
8664 break;
8665 }
8666 }
8667 if (i == CPUWINDOWS_MAX) {
8668 panic("pmap_map_cpu_windows_copy: out of window");
8669 }
8670
8671 pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
8672 #if __ARM_KERNEL_PROTECT__
8673 pte |= ARM_PTE_NG;
8674 #endif /* __ARM_KERNEL_PROTECT__ */
8675
8676 pte |= wimg_to_pte(wimg_bits, ptoa(pn));
8677
8678 if (prot & VM_PROT_WRITE) {
8679 pte |= ARM_PTE_AP(AP_RWNA);
8680 } else {
8681 pte |= ARM_PTE_AP(AP_RONA);
8682 }
8683 #if HAS_FEAT_XS
8684 need_strong_sync = pte_is_xs(native_pt_attr, pte);
8685 #endif
8686 write_pte_fast(ptep, pte);
8687 /*
8688 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
8689 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
8690 */
8691 FLUSH_PTE_STRONG();
8692 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
8693 pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;
8694
8695 return i;
8696 }
8697
8698 unsigned int
8699 pmap_map_cpu_windows_copy(
8700 ppnum_t pn,
8701 vm_prot_t prot,
8702 unsigned int wimg_bits)
8703 {
8704 #if XNU_MONITOR
8705 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8706 #else
8707 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8708 #endif
8709 }
8710
8711 MARK_AS_PMAP_TEXT void
8712 pmap_unmap_cpu_windows_copy_internal(
8713 unsigned int index)
8714 {
8715 pt_entry_t *ptep;
8716 unsigned int cpu_num;
8717 vm_offset_t cpu_copywindow_vaddr = 0;
8718 pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
8719
8720 cpu_num = pmap_cpu_data->cpu_number;
8721
8722 cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
8723 /* Issue full-system DSB to ensure prior operations on the per-CPU window
8724 * (which are likely to have been on I/O memory) are complete before
8725 * tearing down the mapping. */
8726 __builtin_arm_dsb(DSB_SY);
8727 ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
8728 write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
8729 PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
8730 }
8731
/**
 * Public entry point for unmapping a per-CPU copy window.
 *
 * Dispatches to the PPL entry point when XNU_MONITOR is set, and to
 * pmap_unmap_cpu_windows_copy_internal() otherwise.
 *
 * @param index Index of the copy window to unmap, forwarded unchanged.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
	/*
	 * This function returns void, so the callee is invoked as a plain
	 * statement; ISO C does not allow `return expr;` in a void function.
	 */
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8742
8743 #if XNU_MONITOR
8744
8745 MARK_AS_PMAP_TEXT void
8746 pmap_invoke_with_page(
8747 ppnum_t page_number,
8748 void *ctx,
8749 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8750 {
8751 #pragma unused(page_number, ctx, callback)
8752 }
8753
8754 /*
8755 * Loop over every pmap_io_range (I/O ranges marked as owned by
8756 * the PPL in the device tree) and conditionally call callback() on each range
8757 * that needs to be included in the hibernation image.
8758 *
8759 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8760 * context is needed in the callback.
8761 * @param callback Callback function invoked on each range (gated by flag).
8762 */
8763 MARK_AS_PMAP_TEXT void
8764 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8765 {
8766 extern const pmap_io_range_t* io_attr_table;
8767 extern const unsigned int num_io_rgns;
8768 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8769 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8770 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8771 }
8772 }
8773 }
8774
8775 /**
8776 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8777 * PPL-owned page. Otherwise, do nothing.
8778 *
8779 * @param addr Physical address of the page to set the HASHED flag on.
8780 */
8781 MARK_AS_PMAP_TEXT void
8782 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8783 {
8784 /* Ignore non-managed kernel memory. */
8785 if (!pa_valid(addr)) {
8786 return;
8787 }
8788
8789 const unsigned int pai = pa_index(addr);
8790 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8791 pv_entry_t **pv_h = pai_to_pvh(pai);
8792
8793 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8794 pvh_lock(pai);
8795 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8796 pvh_unlock(pai);
8797 }
8798 }
8799
8800 /**
8801 * Loop through every physical page in the system and clear out the HASHED flag
8802 * on every PPL-owned page. That flag is used to keep track of which pages have
8803 * been hashed into the hibernation image during the hibernation entry process.
8804 *
8805 * The HASHED flag needs to be cleared out between hibernation cycles because the
8806 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8807 * image with the HASHED flag set on certain pages. It's important to clear the
8808 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8809 * into the hibernation image can't be compromised across hibernation cycles.
8810 */
8811 MARK_AS_PMAP_TEXT void
8812 pmap_clear_ppl_hashed_flag_all(void)
8813 {
8814 const unsigned int last_index = pa_index(vm_last_phys);
8815 pv_entry_t **pv_h = NULL;
8816
8817 for (int pai = 0; pai < last_index; ++pai) {
8818 pv_h = pai_to_pvh(pai);
8819
8820 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8821 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8822 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8823 pvh_lock(pai);
8824 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8825 pvh_unlock(pai);
8826 }
8827 }
8828 }
8829
8830 /**
8831 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8832 * ppl_hib driver will call this after all wired pages have been copied into the
8833 * hibernation image.
8834 */
8835 MARK_AS_PMAP_TEXT void
8836 pmap_check_ppl_hashed_flag_all(void)
8837 {
8838 const unsigned int last_index = pa_index(vm_last_phys);
8839 pv_entry_t **pv_h = NULL;
8840
8841 for (int pai = 0; pai < last_index; ++pai) {
8842 pv_h = pai_to_pvh(pai);
8843
8844 /**
8845 * The PMAP stacks are explicitly not saved into the image so skip checking
8846 * the pages that contain the PMAP stacks.
8847 */
8848 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8849 (pai < pa_index(pmap_stacks_end_pa));
8850
8851 if (!is_pmap_stack &&
8852 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8853 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8854 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8855 }
8856 }
8857 }
8858
8859 #endif /* XNU_MONITOR */
8860
8861 /*
8862 * Indicate that a pmap is intended to be used as a nested pmap
8863 * within one or more larger address spaces. This must be set
8864 * before pmap_nest() is called with this pmap as the 'subordinate'.
8865 */
8866 MARK_AS_PMAP_TEXT void
8867 pmap_set_nested_internal(
8868 pmap_t pmap)
8869 {
8870 validate_pmap_mutable(pmap);
8871 if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
8872 panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
8873 __func__, pmap, pmap->type);
8874 }
8875
8876 /**
8877 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
8878 * this pmap its own nested pmap.
8879 */
8880 if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
8881 panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
8882 }
8883
8884 pmap_get_pt_ops(pmap)->free_id(pmap);
8885 }
8886
8887 void
8888 pmap_set_nested(
8889 pmap_t pmap)
8890 {
8891 #if XNU_MONITOR
8892 pmap_set_nested_ppl(pmap);
8893 #else
8894 pmap_set_nested_internal(pmap);
8895 #endif
8896 }
8897
8898 bool
8899 pmap_is_nested(
8900 pmap_t pmap)
8901 {
8902 return pmap->type == PMAP_TYPE_NESTED;
8903 }
8904
8905 /*
8906 * pmap_trim_range(pmap, start, end)
8907 *
8908 * pmap = pmap to operate on
8909 * start = start of the range
8910 * end = end of the range
8911 *
8912 * Attempts to deallocate TTEs for the given range in the nested range.
8913 */
8914 MARK_AS_PMAP_TEXT static void
8915 pmap_trim_range(
8916 pmap_t pmap,
8917 addr64_t start,
8918 addr64_t end)
8919 {
8920 addr64_t cur;
8921 addr64_t nested_region_start;
8922 addr64_t nested_region_end;
8923 addr64_t adjusted_start;
8924 addr64_t adjusted_end;
8925 addr64_t adjust_offmask;
8926 tt_entry_t * tte_p;
8927 pt_entry_t * pte_p;
8928 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
8929
8930 if (__improbable(end < start)) {
8931 panic("%s: invalid address range, "
8932 "pmap=%p, start=%p, end=%p",
8933 __func__,
8934 pmap, (void*)start, (void*)end);
8935 }
8936
8937 nested_region_start = pmap->nested_region_addr;
8938 nested_region_end = nested_region_start + pmap->nested_region_size;
8939
8940 if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
8941 panic("%s: range outside nested region %p-%p, "
8942 "pmap=%p, start=%p, end=%p",
8943 __func__, (void *)nested_region_start, (void *)nested_region_end,
8944 pmap, (void*)start, (void*)end);
8945 }
8946
8947 /* Contract the range to TT page boundaries. */
8948 adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
8949 adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
8950 adjusted_end = end & ~adjust_offmask;
8951
8952 /* Iterate over the range, trying to remove TTEs. */
8953 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
8954 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8955
8956 tte_p = pmap_tte(pmap, cur);
8957
8958 if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
8959 pte_p = (pt_entry_t *) ttetokv(*tte_p);
8960
8961 /* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
8962 if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
8963 /* Deallocate for the nested map. */
8964 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8965 } else if (pmap->type == PMAP_TYPE_USER) {
8966 /**
8967 * Just remove for the parent map. If the leaf table pointed
8968 * to by the TTE being removed (owned by the nested pmap)
8969 * has any mappings, then this call will panic. This
8970 * enforces the policy that tables being trimmed must be
8971 * empty to prevent possible use-after-free attacks.
8972 */
8973 pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
8974 } else {
8975 panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
8976 }
8977 } else {
8978 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8979 }
8980 }
8981
8982 /* Remove empty L2 TTs. */
8983 adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
8984 adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);
8985
8986 for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
8987 /* For each L1 entry in our range... */
8988 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
8989
8990 bool remove_tt1e = true;
8991 tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
8992 tt_entry_t * tt2e_start;
8993 tt_entry_t * tt2e_end;
8994 tt_entry_t * tt2e_p;
8995 tt_entry_t tt1e;
8996
8997 if (tt1e_p == NULL) {
8998 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
8999 continue;
9000 }
9001
9002 tt1e = *tt1e_p;
9003
9004 if (tt1e == ARM_TTE_TYPE_FAULT) {
9005 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9006 continue;
9007 }
9008
9009 tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
9010 tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];
9011
9012 for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
9013 if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
9014 /*
9015 * If any TTEs are populated, don't remove the
9016 * L1 TT.
9017 */
9018 remove_tt1e = false;
9019 }
9020 }
9021
9022 if (remove_tt1e) {
9023 pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
9024 } else {
9025 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
9026 }
9027 }
9028 }
9029
9030 /**
9031 * State machine for multi-step pmap trimming. Trimming is the action of
9032 * deallocating the TTEs of the shared region of pmaps down to a given range.
9033 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9034 * disabling preemption for too long. These steps include computing the bounds
9035 * of the shared region, trimming the head of the "grand", trimming the tail of
9036 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9037 * different conditions.
9038 *
9039 * @param grand the pmap in which the pages are nested
9040 * @param subord the pmap from which the pages are shared, or nested
9041 * @param vstart start of the used range in "grand"
9042 * @param size size of the used range
9043 * @param state the current state of the state machine
9044 *
9045 * @return the next state of the state machine, to be used in the next call
9046 * into this function.
9047 */
9048 MARK_AS_PMAP_TEXT pmap_trim_state_t
9049 pmap_trim_internal(
9050 pmap_t grand,
9051 pmap_t subord,
9052 addr64_t vstart,
9053 uint64_t size,
9054 pmap_trim_state_t state)
9055 {
9056 /* Validation needs to be done regardless of state. */
9057 addr64_t vend;
9058
9059 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9060 panic("%s: grand addr wraps around, "
9061 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9062 __func__, grand, subord, (void*)vstart, size, state);
9063 }
9064
9065 validate_pmap_mutable(grand);
9066 validate_pmap(subord);
9067
9068 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9069 panic("%s: subord is of non-nestable type 0x%hhx, "
9070 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9071 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9072 }
9073
9074 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9075 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9076 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9077 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9078 }
9079
9080 if (__improbable(grand->nested_pmap != subord)) {
9081 panic("%s: grand->nested != subord, "
9082 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9083 __func__, grand, subord, (void*)vstart, size, state);
9084 }
9085
9086 if (__improbable((size != 0) &&
9087 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9088 panic("%s: grand range not in nested region, "
9089 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9090 __func__, grand, subord, (void*)vstart, size, state);
9091 }
9092
9093 /* Trimming starts with figuring out the bounds for the grand. */
9094 if (state == PMAP_TRIM_STATE_START) {
9095 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9096
9097 /**
9098 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9099 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9100 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9101 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9102 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9103 * PMAP_TRIM_STATE_DONE.
9104 */
9105 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9106 assert(subord->nested_bounds_set);
9107
9108 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9109 if (!grand->nested_bounds_set) {
9110 /* Inherit the bounds from subord. */
9111 grand->nested_region_true_start = subord->nested_region_true_start;
9112 grand->nested_region_true_end = subord->nested_region_true_end;
9113 grand->nested_bounds_set = true;
9114 }
9115
9116 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9117
9118 /* Now that the grand has bounds, we are done. */
9119 return PMAP_TRIM_STATE_DONE;
9120 }
9121
9122 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9123 if ((!subord->nested_bounds_set) && size) {
9124 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9125 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9126
9127 subord->nested_region_true_start = vstart;
9128 subord->nested_region_true_end = vend;
9129 subord->nested_region_true_start &= ~adjust_offmask;
9130
9131 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9132 panic("%s: padded true end wraps around, "
9133 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9134 __func__, grand, subord, (void*)vstart, size, state);
9135 }
9136
9137 subord->nested_region_true_end &= ~adjust_offmask;
9138 subord->nested_bounds_set = true;
9139 }
9140
9141 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9142 if (subord->nested_bounds_set) {
9143 /* Inherit the bounds from subord. */
9144 grand->nested_region_true_start = subord->nested_region_true_start;
9145 grand->nested_region_true_end = subord->nested_region_true_end;
9146 grand->nested_bounds_set = true;
9147
9148 /* If we know the bounds, we can trim the pmap. */
9149 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9150
9151 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9152 } else {
9153 /* Don't trim if we don't know the bounds. */
9154 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9155
9156 return PMAP_TRIM_STATE_DONE;
9157 }
9158 }
9159
9160 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9161 if (!grand->nested_bounds_set) {
9162 panic("%s: !grand->nested_bounds_set, "
9163 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9164 __func__, grand, subord, (void*)vstart, size, state);
9165 }
9166
9167 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9168 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9169 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9170 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9171 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9172 (unsigned int)grand->nested_no_bounds_ref_state);
9173 }
9174
9175 #if XNU_MONITOR
9176 if (pmap_pending_preemption()) {
9177 return PMAP_TRIM_STATE_GRAND_AFTER;
9178 }
9179 #endif
9180
9181 state = PMAP_TRIM_STATE_GRAND_AFTER;
9182 }
9183
9184 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9185 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9186 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9187 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9188 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9189 (unsigned int)grand->nested_no_bounds_ref_state);
9190 }
9191
9192 #if XNU_MONITOR
9193 if (pmap_pending_preemption()) {
9194 return PMAP_TRIM_STATE_SUBORD;
9195 }
9196 #endif
9197
9198 state = PMAP_TRIM_STATE_SUBORD;
9199 }
9200
9201 /* START state is guaranteed to compute the bounds for the subord. */
9202 if (!subord->nested_bounds_set) {
9203 panic("%s: !subord->nested_bounds_set, "
9204 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9205 __func__, grand, subord, (void*)vstart, size, state);
9206 }
9207
9208 if (state == PMAP_TRIM_STATE_SUBORD) {
9209 /**
9210 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9211 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9212 * called once grand's nested tables have been fully trimmed, and can only be called once
9213 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9214 * the state update is visible only once the preceding trim operation is complete. An
9215 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9216 * but locking within pmap_trim_range() should make that harmless (and all but one will
9217 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9218 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9219 * of the state CAS.
9220 */
9221 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9222 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9223 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9224 (unsigned int)grand->nested_no_bounds_ref_state);
9225 }
9226 pmap_trim_subord(subord);
9227 }
9228
9229 return PMAP_TRIM_STATE_DONE;
9230 }
9231
9232 MARK_AS_PMAP_TEXT static void
9233 pmap_trim_self(pmap_t pmap)
9234 {
9235 if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
9236 /* If we have a no bounds ref, we need to drop it. */
9237 pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9238 pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
9239 boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
9240 vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
9241 vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
9242 pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);
9243
9244 if (nested_bounds_set) {
9245 pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
9246 pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
9247 }
9248 /*
9249 * Try trimming the nested pmap, in case we had the
9250 * last reference.
9251 */
9252 pmap_trim_subord(pmap->nested_pmap);
9253 }
9254 }
9255
9256 /*
9257 * pmap_trim_subord(grand, subord)
9258 *
9259 * grand = pmap that we have nested subord in
9260 * subord = nested pmap we are attempting to trim
9261 *
9262 * Trims subord if possible
9263 */
9264 MARK_AS_PMAP_TEXT static void
9265 pmap_trim_subord(pmap_t subord)
9266 {
9267 bool contract_subord = false;
9268
9269 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9270
9271 subord->nested_no_bounds_refcnt--;
9272
9273 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9274 /* If this was the last no bounds reference, trim subord. */
9275 contract_subord = true;
9276 }
9277
9278 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9279
9280 if (contract_subord) {
9281 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9282 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9283 }
9284 }
9285
9286 /**
9287 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9288 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9289 * disabling preemption for too long.
9290 *
9291 * @note When we load the shared region we always create pages tables for the
9292 * entire region. In practice, the shared cache may use just a portion
9293 * of that. Before we know the bounds of the shared region, it can
9294 * already be mapped into processes. Therefore, once the bounds are
9295 * known, "trimming" comes in handy to remove the unnecessary page
9296 * tables in the processes the shared region is mapped in, and eventually
9297 * those in the shared region itself. Note that the shared region must
9298 * be trimmed after the user processes because it has the L3 entries
9299 * everyone else is pointing to.
9300 *
9301 * @param grand the pmap in which the pages are nested
9302 * @param subord the pmap from which the pages are shared, or nested
9303 * @param vstart start of the used range in "grand"
9304 * @param size size of the used range
9305 */
9306 void
9307 pmap_trim(
9308 pmap_t grand,
9309 pmap_t subord,
9310 addr64_t vstart,
9311 uint64_t size)
9312 {
9313 pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9314
9315 #if XNU_MONITOR
9316 /* On PPL systems, drives the state machine until its done. */
9317 while (state != PMAP_TRIM_STATE_DONE) {
9318 __assert_only pmap_trim_state_t old_state = state;
9319 state = pmap_trim_ppl(grand, subord, vstart, size, state);
9320
9321 /* Are we making progress? */
9322 assert(old_state != state);
9323 }
9324
9325 pmap_ledger_check_balance(grand);
9326 pmap_ledger_check_balance(subord);
9327 #else
9328 state = pmap_trim_internal(grand, subord, vstart, size, state);
9329
9330 /* On non-PPL systems, we expect the implementation to finish in one call. */
9331 assert(state == PMAP_TRIM_STATE_DONE);
9332 #endif
9333 }
9334
9335 #if HAS_APPLE_PAC
9336 void *
9337 pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9338 {
9339 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9340 panic("attempt to sign user pointer without process independent key");
9341 }
9342
9343 void *res = NULL;
9344 uint64_t current_intr_state = pmap_interrupts_disable();
9345
9346 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9347
9348 __compiler_materialize_and_prevent_reordering_on(value);
9349 switch (key) {
9350 case ptrauth_key_asia:
9351 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
9352 break;
9353 case ptrauth_key_asda:
9354 res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
9355 break;
9356 default:
9357 __builtin_unreachable();
9358 }
9359 __compiler_materialize_and_prevent_reordering_on(res);
9360
9361 ml_disable_user_jop_key(jop_key, saved_jop_state);
9362
9363 pmap_interrupts_restore(current_intr_state);
9364
9365 return res;
9366 }
9367
9368 void *
9369 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9370 {
9371 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9372 }
9373
9374 void *
9375 pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9376 {
9377 if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
9378 panic("attempt to auth user pointer without process independent key");
9379 }
9380
9381 void *res = NULL;
9382 uint64_t current_intr_state = pmap_interrupts_disable();
9383
9384 uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
9385 __compiler_materialize_and_prevent_reordering_on(value);
9386 res = ml_auth_ptr_unchecked(value, key, discriminator);
9387 __compiler_materialize_and_prevent_reordering_on(res);
9388 ml_disable_user_jop_key(jop_key, saved_jop_state);
9389
9390 pmap_interrupts_restore(current_intr_state);
9391
9392 return res;
9393 }
9394
9395 void *
9396 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9397 {
9398 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9399 }
9400 #endif /* HAS_APPLE_PAC */
9401
/*
 * Phase marker for pmap_[un]nest() operations: it indicates that the operation
 * has finished working on the 'subordinate' pmap and has moved on to the
 * 'grand' pmap. The flag travels in the low-order bit of the 'vrestart'
 * parameter and of the return value, so that a preempted [un]nest operation
 * knows where to resume. A return value holding the ending address of the
 * nested region with PMAP_NEST_GRAND set in the low-order bit means the
 * operation has completed.
 */
#define PMAP_NEST_GRAND ((vm_map_offset_t)0x1)
9411
9412 /*
9413 * kern_return_t pmap_nest(grand, subord, vstart, size)
9414 *
9415 * grand = the pmap that we will nest subord into
9416 * subord = the pmap that goes into the grand
9417 * vstart = start of range in pmap to be inserted
9418 * size = Size of nest area (up to 16TB)
9419 *
9420 * Inserts a pmap into another. This is used to implement shared segments.
9421 *
9422 */
9423
9424 /**
9425 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9426 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9427 * This function operates in 3 main phases:
9428 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9429 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9430 * the mapping range are present in subord.
9431 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9432 * contains pointers to subord's leaf-level pagetable pages for the specified
9433 * VA range.
9434 *
9435 * This function may return early due to pending AST_URGENT preemption; if so
9436 * it will indicate the need to be re-entered.
9437 *
9438 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9439 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9440 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9441 * @param size twig-aligned size of the nesting range
9442 * @param vrestart the twig-aligned starting address of the current call. May contain
9443 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9444 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9445 * KERN_RESOURCE_SHORTAGE on allocation failure.
9446 *
9447 * @return the virtual address at which to restart the operation, possibly including
9448 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9449 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9450 */
9451 MARK_AS_PMAP_TEXT vm_map_offset_t
9452 pmap_nest_internal(
9453 pmap_t grand,
9454 pmap_t subord,
9455 addr64_t vstart,
9456 uint64_t size,
9457 vm_map_offset_t vrestart,
9458 kern_return_t *krp)
9459 {
9460 kern_return_t kr = KERN_FAILURE;
9461 vm_map_offset_t vaddr;
9462 tt_entry_t *stte_p;
9463 tt_entry_t *gtte_p;
9464 uint64_t nested_region_unnested_table_bitmap_size;
9465 unsigned int* nested_region_unnested_table_bitmap = NULL;
9466 uint64_t new_nested_region_unnested_table_bitmap_size;
9467 unsigned int* new_nested_region_unnested_table_bitmap = NULL;
9468 int expand_options = 0;
9469 bool deref_subord = true;
9470 bool grand_locked = false;
9471
9472 addr64_t vend;
9473 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9474 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9475 }
9476 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9477 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9478 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9479 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9480 }
9481
9482 assert(krp != NULL);
9483 validate_pmap_mutable(grand);
9484 validate_pmap(subord);
9485 #if XNU_MONITOR
9486 /*
9487 * Ordering is important here. validate_pmap() has already ensured subord is a
9488 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9489 * be in the process of being destroyed. If destruction is already committed,
9490 * then the check of ref_count below will cover us. If destruction is initiated
9491 * during or after this call, then pmap_destroy() will catch the non-zero
9492 * nested_count.
9493 */
9494 os_atomic_inc(&subord->nested_count, relaxed);
9495 os_atomic_thread_fence(seq_cst);
9496 #endif
9497 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9498 panic("%s: invalid subordinate pmap %p", __func__, subord);
9499 }
9500
9501 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9502 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9503 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9504 }
9505
9506 #if XNU_MONITOR
9507 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9508 #endif
9509
9510 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9511 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9512 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9513 grand, vstart, size, (unsigned long long)vrestart);
9514 }
9515
9516 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9517 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9518 }
9519
9520 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9521 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9522 }
9523
9524 if (subord->nested_region_unnested_table_bitmap == NULL) {
9525 nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9526
9527 if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9528 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9529 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9530 __func__, nested_region_unnested_table_bitmap_size,
9531 grand, subord, vstart, size);
9532 }
9533
9534 #if XNU_MONITOR
9535 pmap_paddr_t pa = 0;
9536
9537 if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9538 panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9539 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9540 __FUNCTION__, nested_region_unnested_table_bitmap_size,
9541 grand, subord, vstart, size);
9542 }
9543
9544 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9545
9546 if (kr != KERN_SUCCESS) {
9547 goto nest_cleanup;
9548 }
9549
9550 assert(pa);
9551
9552 nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9553 #else
9554 nested_region_unnested_table_bitmap = kalloc_data(
9555 nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9556 Z_WAITOK | Z_ZERO);
9557 #endif
9558
9559 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9560 kr = KERN_ABORTED;
9561 goto nest_cleanup;
9562 }
9563
9564 if (subord->nested_region_unnested_table_bitmap == NULL) {
9565 subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9566 subord->nested_region_addr = vstart;
9567 subord->nested_region_size = (mach_vm_offset_t) size;
9568
9569 /**
9570 * Ensure that the rest of the subord->nested_region_* fields are
9571 * initialized and visible before setting the nested_region_unnested_table_bitmap
9572 * field (which is used as the flag to say that the rest are initialized).
9573 */
9574 __builtin_arm_dmb(DMB_ISHST);
9575 subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
9576 nested_region_unnested_table_bitmap = NULL;
9577 }
9578 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9579 if (nested_region_unnested_table_bitmap != NULL) {
9580 #if XNU_MONITOR
9581 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9582 #else
9583 kfree_data(nested_region_unnested_table_bitmap,
9584 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9585 #endif
9586 nested_region_unnested_table_bitmap = NULL;
9587 }
9588 }
9589
9590 /**
9591 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9592 * speculated before their initialization.
9593 */
9594 __builtin_arm_dmb(DMB_ISHLD);
9595
9596 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9597 uint64_t new_size;
9598
9599 nested_region_unnested_table_bitmap = NULL;
9600 nested_region_unnested_table_bitmap_size = 0ULL;
9601 new_size = vend - subord->nested_region_addr;
9602
9603 new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9604
9605 if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9606 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9607 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9608 __func__, new_nested_region_unnested_table_bitmap_size,
9609 grand, subord, vstart, size);
9610 }
9611
9612 #if XNU_MONITOR
9613 pmap_paddr_t pa = 0;
9614
9615 if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9616 panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9617 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9618 __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
9619 grand, subord, vstart, new_size);
9620 }
9621
9622 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9623
9624 if (kr != KERN_SUCCESS) {
9625 goto nest_cleanup;
9626 }
9627
9628 assert(pa);
9629
9630 new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9631 #else
9632 new_nested_region_unnested_table_bitmap = kalloc_data(
9633 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9634 Z_WAITOK | Z_ZERO);
9635 #endif
9636 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9637 kr = KERN_ABORTED;
9638 goto nest_cleanup;
9639 }
9640
9641 if (subord->nested_region_size < new_size) {
9642 bcopy(subord->nested_region_unnested_table_bitmap,
9643 new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size);
9644 nested_region_unnested_table_bitmap_size = subord->nested_region_unnested_table_bitmap_size;
9645 nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
9646 subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
9647 subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
9648 subord->nested_region_size = new_size;
9649 new_nested_region_unnested_table_bitmap = NULL;
9650 }
9651 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9652 if (nested_region_unnested_table_bitmap != NULL) {
9653 #if XNU_MONITOR
9654 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9655 #else
9656 kfree_data(nested_region_unnested_table_bitmap,
9657 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9658 #endif
9659 nested_region_unnested_table_bitmap = NULL;
9660 }
9661 if (new_nested_region_unnested_table_bitmap != NULL) {
9662 #if XNU_MONITOR
9663 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9664 #else
9665 kfree_data(new_nested_region_unnested_table_bitmap,
9666 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9667 #endif
9668 new_nested_region_unnested_table_bitmap = NULL;
9669 }
9670 }
9671
9672 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9673 kr = KERN_ABORTED;
9674 goto nest_cleanup;
9675 }
9676
9677 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
9678 /**
9679 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9680 * into a nested pmap, which would then produce multiple levels of nesting.
9681 */
9682 if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9683 panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9684 }
9685 /*
9686 * If this is grand's first nesting operation, keep the reference on subord.
9687 * It will be released by pmap_destroy_internal() when grand is destroyed.
9688 */
9689 deref_subord = false;
9690
9691 if (!subord->nested_bounds_set) {
9692 /*
9693 * We are nesting without the shared regions bounds
9694 * being known. We'll have to trim the pmap later.
9695 */
9696 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9697 NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9698 panic("%s: grand %p already nested", __func__, grand);
9699 }
9700 subord->nested_no_bounds_refcnt++;
9701 }
9702
9703 grand->nested_region_addr = vstart;
9704 grand->nested_region_size = (mach_vm_offset_t) size;
9705 } else {
9706 if (__improbable(grand->nested_pmap != subord)) {
9707 panic("pmap_nest() pmap %p has a nested pmap", grand);
9708 } else if (__improbable(grand->nested_region_addr > vstart)) {
9709 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9710 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9711 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9712 }
9713 }
9714
9715 vaddr = vrestart & ~PMAP_NEST_GRAND;
9716 if (vaddr < subord->nested_region_true_start) {
9717 vaddr = subord->nested_region_true_start;
9718 }
9719
9720 addr64_t true_end = vend;
9721 if (true_end > subord->nested_region_true_end) {
9722 true_end = subord->nested_region_true_end;
9723 }
9724 __unused unsigned int ttecount = 0;
9725
9726 if (vrestart & PMAP_NEST_GRAND) {
9727 goto nest_grand;
9728 }
9729
9730 while (vaddr < true_end) {
9731 stte_p = pmap_tte(subord, vaddr);
9732 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9733 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9734 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9735
9736 if (kr != KERN_SUCCESS) {
9737 goto done;
9738 }
9739
9740 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9741 }
9742 vaddr += pt_attr_twig_size(pt_attr);
9743 vrestart = vaddr;
9744 ++ttecount;
9745 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9746 pmap_pending_preemption())) {
9747 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9748 kr = KERN_SUCCESS;
9749 goto done;
9750 }
9751 }
9752 /*
9753 * copy TTEs from subord pmap into grand pmap
9754 */
9755
9756 vaddr = (vm_map_offset_t) vstart;
9757 if (vaddr < subord->nested_region_true_start) {
9758 vaddr = subord->nested_region_true_start;
9759 }
9760 vrestart = vaddr | PMAP_NEST_GRAND;
9761
9762 nest_grand:
9763 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9764
9765 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9766 kr = KERN_ABORTED;
9767 goto done;
9768 }
9769 while (vaddr < true_end) {
9770 stte_p = pmap_tte(subord, vaddr);
9771 if (__improbable(stte_p == PT_ENTRY_NULL)) {
9772 panic("%s: subord pmap %p not exapnded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
9773 }
9774 gtte_p = pmap_tte(grand, vaddr);
9775 if (gtte_p == PT_ENTRY_NULL) {
9776 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9777 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9778 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9779 if (kr == KERN_SUCCESS) {
9780 kr = KERN_ABORTED;
9781 }
9782 }
9783
9784 if (kr != KERN_SUCCESS) {
9785 goto done;
9786 }
9787
9788 gtte_p = pmap_tt2e(grand, vaddr);
9789 }
9790 /* Don't leak a page table page. Don't violate break-before-make. */
9791 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9792 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9793 __func__, gtte_p, grand);
9794 }
9795 *gtte_p = *stte_p;
9796
9797 vaddr += pt_attr_twig_size(pt_attr);
9798 vrestart = vaddr | PMAP_NEST_GRAND;
9799 ++ttecount;
9800 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9801 pmap_pending_preemption())) {
9802 break;
9803 }
9804 }
9805 if (vaddr >= true_end) {
9806 vrestart = vend | PMAP_NEST_GRAND;
9807 }
9808
9809 kr = KERN_SUCCESS;
9810 done:
9811
9812 FLUSH_PTE();
9813 __builtin_arm_isb(ISB_SY);
9814
9815 if (grand_locked) {
9816 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9817 }
9818
9819 nest_cleanup:
9820 #if XNU_MONITOR
9821 if (kr != KERN_SUCCESS) {
9822 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9823 *krp = kr;
9824 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9825 }
9826 #else
9827 if (kr != KERN_SUCCESS) {
9828 *krp = kr;
9829 }
9830 #endif
9831 if (nested_region_unnested_table_bitmap != NULL) {
9832 #if XNU_MONITOR
9833 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9834 #else
9835 kfree_data(nested_region_unnested_table_bitmap,
9836 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9837 #endif
9838 }
9839 if (new_nested_region_unnested_table_bitmap != NULL) {
9840 #if XNU_MONITOR
9841 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9842 #else
9843 kfree_data(new_nested_region_unnested_table_bitmap,
9844 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9845 #endif
9846 }
9847 if (deref_subord) {
9848 #if XNU_MONITOR
9849 os_atomic_dec(&subord->nested_count, relaxed);
9850 #endif
9851 pmap_destroy_internal(subord);
9852 }
9853 return vrestart;
9854 }
9855
9856 kern_return_t
9857 pmap_nest(
9858 pmap_t grand,
9859 pmap_t subord,
9860 addr64_t vstart,
9861 uint64_t size)
9862 {
9863 kern_return_t kr = KERN_SUCCESS;
9864 vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
9865 vm_map_offset_t vend = vaddr + size;
9866 __unused vm_map_offset_t vlast = vaddr;
9867
9868 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
9869 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
9870 VM_KERNEL_ADDRHIDE(vstart));
9871
9872 pmap_verify_preemptible();
9873 #if XNU_MONITOR
9874 while (vaddr != (vend | PMAP_NEST_GRAND)) {
9875 vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
9876 if (kr == KERN_RESOURCE_SHORTAGE) {
9877 pmap_alloc_page_for_ppl(0);
9878 kr = KERN_SUCCESS;
9879 } else if (kr == KERN_ABORTED) {
9880 /**
9881 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
9882 * that it won't update kr when KERN_SUCCESS is to be returned.
9883 * Therefore, the KERN_ABORTED needs to be manually cleared here,
9884 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
9885 */
9886 kr = KERN_SUCCESS;
9887 continue;
9888 } else if (kr != KERN_SUCCESS) {
9889 break;
9890 } else if (vaddr == vlast) {
9891 panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
9892 __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
9893 }
9894 vlast = vaddr;
9895 }
9896
9897 pmap_ledger_check_balance(grand);
9898 pmap_ledger_check_balance(subord);
9899 #else
9900 /**
9901 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
9902 * we have verified preemptibility. Therefore, pmap_nest_internal() will
9903 * wait for a page or a lock instead of bailing out as in the PPL flavor.
9904 */
9905 while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
9906 vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
9907 }
9908 #endif
9909
9910 PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);
9911
9912 return kr;
9913 }
9914
9915 /*
9916 * kern_return_t pmap_unnest(grand, vaddr)
9917 *
9918 * grand = the pmap that will have the virtual range unnested
9919 * vaddr = start of range in pmap to be unnested
9920 * size = size of range in pmap to be unnested
9921 *
9922 */
9923
9924 kern_return_t
9925 pmap_unnest(
9926 pmap_t grand,
9927 addr64_t vaddr,
9928 uint64_t size)
9929 {
9930 return pmap_unnest_options(grand, vaddr, size, 0);
9931 }
9932
9933 /**
9934 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
9935 * from a top-level pmap ('grand'). The corresponding mappings in the nested
9936 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
9937 * still have the region nested. The mappings in 'grand' will be left empty
9938 * with the assumption that they will be demand-filled by subsequent access faults.
9939 *
9940 * This function operates in 2 main phases:
9941 * 1. Iteration over the nested pmap's mappings for the specified range to mark
9942 * them non-global.
9943 * 2. Clearing of the twig-level TTEs for the address range in grand.
9944 *
9945 * This function may return early due to pending AST_URGENT preemption; if so
9946 * it will indicate the need to be re-entered.
9947 *
9948 * @param grand pmap from which to unnest mappings
9949 * @param vaddr twig-aligned virtual address for the beginning of the nested range
9950 * @param size twig-aligned size of the nested range
9951 * @param vrestart the page-aligned starting address of the current call. May contain
9952 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
9953 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
9954 * grand is being torn down and step 1) above is not needed.
9955 *
9956 * @return the virtual address at which to restart the operation, possibly including
9957 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9958 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
9959 */
9960 MARK_AS_PMAP_TEXT vm_map_offset_t
9961 pmap_unnest_options_internal(
9962 pmap_t grand,
9963 addr64_t vaddr,
9964 uint64_t size,
9965 vm_map_offset_t vrestart,
9966 unsigned int option)
9967 {
9968 vm_map_offset_t start;
9969 vm_map_offset_t addr;
9970 tt_entry_t *tte_p;
9971 unsigned int current_index;
9972 unsigned int start_index;
9973 unsigned int max_index;
9974 unsigned int entry_count = 0;
9975
9976 addr64_t vend;
9977 addr64_t true_end;
9978 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
9979 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
9980 }
9981 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9982 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
9983 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9984 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
9985 }
9986
9987 validate_pmap_mutable(grand);
9988
9989 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9990
9991 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
9992 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
9993 (unsigned long long)vaddr, (unsigned long long)size);
9994 }
9995
9996 if (__improbable(grand->nested_pmap == NULL)) {
9997 panic("%s: %p has no nested pmap", __func__, grand);
9998 }
9999
10000 true_end = vend;
10001 if (true_end > grand->nested_pmap->nested_region_true_end) {
10002 true_end = grand->nested_pmap->nested_region_true_end;
10003 }
10004
10005 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10006 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10007 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10008 }
10009
10010 if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10011 return vrestart;
10012 }
10013
10014 start = vrestart;
10015 if (start < grand->nested_pmap->nested_region_true_start) {
10016 start = grand->nested_pmap->nested_region_true_start;
10017 }
10018 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10019 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10020 bool flush_tlb = false;
10021
10022 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10023 pt_entry_t *bpte, *cpte;
10024
10025 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10026
10027 bpte = pmap_pte(grand->nested_pmap, addr);
10028
10029 /*
10030 * If we've re-entered this function partway through unnesting a leaf region, the
10031 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10032 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
10033 * address.
10034 */
10035 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10036 (addr & pt_attr_twig_offmask(pt_attr))) {
10037 /*
10038 * Mark the 'twig' region as being unnested. Every mapping entered within
10039 * the nested pmap in this region will now be marked non-global. Do this
10040 * before marking any of the PTEs within the region as non-global to avoid
10041 * the possibility of pmap_enter() subsequently inserting a global mapping
10042 * in the region, which could lead to a TLB conflict if a non-global entry
10043 * is later inserted for the same VA in a pmap which has fully unnested this
10044 * region.
10045 */
10046 setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10047 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10048 pmap_paddr_t pa;
10049 unsigned int pai = 0;
10050 boolean_t managed = FALSE;
10051 pt_entry_t spte;
10052
10053 if ((*cpte != ARM_PTE_TYPE_FAULT)
10054 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10055 spte = *((volatile pt_entry_t*)cpte);
10056 while (!managed) {
10057 pa = pte_to_pa(spte);
10058 if (!pa_valid(pa)) {
10059 break;
10060 }
10061 pai = pa_index(pa);
10062 pvh_lock(pai);
10063 spte = *((volatile pt_entry_t*)cpte);
10064 pa = pte_to_pa(spte);
10065 if (pai == pa_index(pa)) {
10066 managed = TRUE;
10067 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10068 }
10069 pvh_unlock(pai);
10070 }
10071
10072 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10073 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10074 flush_tlb = true;
10075 }
10076
10077 if (managed) {
10078 pvh_assert_locked(pai);
10079 pvh_unlock(pai);
10080 }
10081 }
10082
10083 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10084 vrestart = addr;
10085 ++entry_count;
10086 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10087 pmap_pending_preemption())) {
10088 goto unnest_subord_done;
10089 }
10090 }
10091 }
10092 addr = vlim;
10093 vrestart = addr;
10094 ++entry_count;
10095 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10096 pmap_pending_preemption())) {
10097 break;
10098 }
10099 }
10100
10101 unnest_subord_done:
10102 if (flush_tlb) {
10103 FLUSH_PTE_STRONG();
10104 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10105 }
10106
10107 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10108 if (current_index < max_index) {
10109 return vrestart;
10110 }
10111 }
10112
10113 /*
10114 * invalidate all pdes for segment at vaddr in pmap grand
10115 */
10116 if (vrestart & PMAP_NEST_GRAND) {
10117 addr = vrestart & ~PMAP_NEST_GRAND;
10118 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10119 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10120 }
10121 } else {
10122 addr = vaddr;
10123 vrestart = vaddr | PMAP_NEST_GRAND;
10124 }
10125
10126 /**
10127 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10128 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10129 * upon reentry.
10130 */
10131 if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10132 return vrestart;
10133 }
10134
10135 if (addr < grand->nested_pmap->nested_region_true_start) {
10136 addr = grand->nested_pmap->nested_region_true_start;
10137 }
10138
10139 start = addr;
10140
10141 while (addr < true_end) {
10142 tte_p = pmap_tte(grand, addr);
10143 /*
10144 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10145 * so it's possible that a region we're trying to unnest may not have been
10146 * nested in the first place.
10147 */
10148 if (tte_p != NULL) {
10149 *tte_p = ARM_TTE_TYPE_FAULT;
10150 }
10151 addr += pt_attr_twig_size(pt_attr);
10152 vrestart = addr | PMAP_NEST_GRAND;
10153 ++entry_count;
10154 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10155 pmap_pending_preemption())) {
10156 break;
10157 }
10158 }
10159 if (addr >= true_end) {
10160 vrestart = vend | PMAP_NEST_GRAND;
10161 }
10162
10163 FLUSH_PTE_STRONG();
10164 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10165
10166 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10167
10168 return vrestart;
10169 }
10170
10171 kern_return_t
10172 pmap_unnest_options(
10173 pmap_t grand,
10174 addr64_t vaddr,
10175 uint64_t size,
10176 unsigned int option)
10177 {
10178 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10179 vm_map_offset_t vend = vaddr + size;
10180
10181 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10182 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10183
10184 pmap_verify_preemptible();
10185 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10186 #if XNU_MONITOR
10187 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10188 #else
10189 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10190 #endif
10191 }
10192
10193 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10194
10195 return KERN_SUCCESS;
10196 }
10197
10198 boolean_t
10199 pmap_adjust_unnest_parameters(
10200 __unused pmap_t p,
10201 __unused vm_map_offset_t *s,
10202 __unused vm_map_offset_t *e)
10203 {
10204 return TRUE; /* to get to log_unnest_badness()... */
10205 }
10206
#if PMAP_FORK_NEST
/**
 * Perform any necessary pre-nesting of the parent's shared region at fork()
 * time.
 *
 * @note This should only be called from vm_map_fork().
 *
 * @param old_pmap The pmap of the parent task.
 * @param new_pmap The pmap of the child task.
 * @param nesting_start An output parameter that is updated with the start
 *                      address of the range that was pre-nested.  Only
 *                      written when a range was actually pre-nested.
 * @param nesting_end An output parameter that is updated with the end
 *                    address of the range that was pre-nested.  Only
 *                    written when a range was actually pre-nested.
 *
 * @return KERN_SUCCESS if the pre-nesting was successfully completed, or if
 *         the parent has no nested pmap (in which case nothing is nested and
 *         the output parameters are left untouched).
 *         KERN_INVALID_ARGUMENT if either pmap argument is NULL.
 *         Any failure code returned by pmap_nest(); the output parameters
 *         are left untouched in that case.
 */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}

	/*
	 * Do not discard the nesting result: reporting success and a pre-nested
	 * range after a failed pmap_nest() would mislead the caller.
	 */
	const kern_return_t kr = pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	if (__improbable(kr != KERN_SUCCESS)) {
		return kr;
	}

	/* After a successful nest the child must mirror the parent's nesting state. */
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
#endif /* PMAP_FORK_NEST */
10256
10257 /*
10258 * disable no-execute capability on
10259 * the specified pmap
10260 */
10261 #if DEVELOPMENT || DEBUG
10262 void
10263 pmap_disable_NX(
10264 pmap_t pmap)
10265 {
10266 pmap->nx_enabled = FALSE;
10267 }
10268 #else
10269 void
10270 pmap_disable_NX(
10271 __unused pmap_t pmap)
10272 {
10273 }
10274 #endif
10275
10276 /*
10277 * flush a range of hardware TLB entries.
10278 * NOTE: assumes the smallest TLB entry in use will be for
10279 * an ARM small page (4K).
10280 */
10281
10282 #if __ARM_RANGE_TLBI__
10283 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10284 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10285 #else
10286 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10287 #endif // __ARM_RANGE_TLBI__
10288 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10289 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10290 "of npages to 32 bits below may truncate.");
10291
10292 static void
10293 flush_mmu_tlb_region_asid_async(
10294 vm_offset_t va,
10295 size_t length,
10296 pmap_t pmap,
10297 bool last_level_only __unused,
10298 bool strong __unused)
10299 {
10300 unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
10301 const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
10302 size_t npages = length >> pmap_page_shift;
10303 uint32_t asid;
10304
10305 asid = pmap->hw_asid;
10306
10307 if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
10308 boolean_t flush_all = FALSE;
10309
10310 if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
10311 flush_all = TRUE;
10312 }
10313 if (flush_all) {
10314 flush_mmu_tlb_async();
10315 } else {
10316 flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
10317 }
10318 return;
10319 }
10320 #if __ARM_RANGE_TLBI__
10321 if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
10322 /**
10323 * Note that casting npages to 32 bits here is always safe thanks to
10324 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
10325 */
10326 va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
10327 if (pmap->type == PMAP_TYPE_NESTED) {
10328 flush_mmu_tlb_allrange_async(va, last_level_only, strong);
10329 } else {
10330 flush_mmu_tlb_range_async(va, last_level_only, strong);
10331 }
10332 return;
10333 }
10334 #endif
10335 vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
10336 va = tlbi_asid(asid) | tlbi_addr(va);
10337
10338 if (pmap->type == PMAP_TYPE_NESTED) {
10339 flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
10340 } else {
10341 flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
10342 }
10343 }
10344
10345 MARK_AS_PMAP_TEXT static void
10346 flush_mmu_tlb_full_asid_async(pmap_t pmap)
10347 {
10348 flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
10349 }
10350
10351 void
10352 flush_mmu_tlb_region(
10353 vm_offset_t va,
10354 unsigned length)
10355 {
10356 flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
10357 sync_tlb_flush();
10358 }
10359
10360 unsigned int
10361 pmap_cache_attributes(
10362 ppnum_t pn)
10363 {
10364 pmap_paddr_t paddr;
10365 unsigned int pai;
10366 unsigned int result;
10367 pp_attr_t pp_attr_current;
10368
10369 paddr = ptoa(pn);
10370
10371 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10372
10373 if (!pa_valid(paddr)) {
10374 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10375 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10376 }
10377
10378 result = VM_WIMG_DEFAULT;
10379
10380 pai = pa_index(paddr);
10381
10382 pp_attr_current = pp_attr_table[pai];
10383 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10384 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10385 }
10386 return result;
10387 }
10388
10389 MARK_AS_PMAP_TEXT static void
10390 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10391 {
10392 if ((wimg_bits_prev != wimg_bits_new)
10393 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10394 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10395 && (wimg_bits_new != VM_WIMG_COPYBACK))
10396 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10397 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10398 pmap_sync_page_attributes_phys(pn);
10399 }
10400
10401 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10402 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10403 }
10404 }
10405
10406 MARK_AS_PMAP_TEXT __unused void
10407 pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
10408 {
10409 pmap_paddr_t paddr = ptoa(pn);
10410 const unsigned int pai = pa_index(paddr);
10411
10412 if (__improbable(!pa_valid(paddr))) {
10413 panic("%s called on non-managed page 0x%08x", __func__, pn);
10414 }
10415
10416 pvh_lock(pai);
10417
10418 #if XNU_MONITOR
10419 if (__improbable(ppattr_pa_test_monitor(paddr))) {
10420 panic("%s invoked on PPL page 0x%08x", __func__, pn);
10421 }
10422 #endif
10423
10424 pmap_update_cache_attributes_locked(pn, new_cacheattr, true);
10425
10426 pvh_unlock(pai);
10427
10428 pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
10429 }
10430
10431 void *
10432 pmap_map_compressor_page(ppnum_t pn)
10433 {
10434 #if __ARM_PTE_PHYSMAP__
10435 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10436 if (cacheattr != VM_WIMG_DEFAULT) {
10437 #if XNU_MONITOR
10438 pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
10439 #else
10440 pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
10441 #endif
10442 }
10443 #endif
10444 return (void*)phystokv(ptoa(pn));
10445 }
10446
10447 void
10448 pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
10449 {
10450 #if __ARM_PTE_PHYSMAP__
10451 unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
10452 if (cacheattr != VM_WIMG_DEFAULT) {
10453 #if XNU_MONITOR
10454 pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
10455 #else
10456 pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
10457 #endif
10458 }
10459 #endif
10460 }
10461
10462 /**
10463 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10464 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10465 *
10466 * @param user_page_list List of pages to be updated.
10467 * @param page_cnt Number of pages in total in user_page_list.
10468 * @param cacheattr The new cache attribute.
10469 *
10470 * @return Success if true is returned.
10471 */
10472 bool
10473 pmap_batch_set_cache_attributes(
10474 upl_page_info_array_t user_page_list,
10475 unsigned int page_cnt,
10476 unsigned int cacheattr)
10477 {
10478 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10479
10480 if (page_cnt == 0) {
10481 return true;
10482 }
10483
10484 batch_set_cache_attr_state_t states;
10485 states.page_index = 0;
10486 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10487 states.tlb_flush_pass_needed = false;
10488 states.rt_cache_flush_pass_needed = false;
10489
10490 /* Verify we are being called from a preemptible context. */
10491 pmap_verify_preemptible();
10492
10493 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10494 #if XNU_MONITOR
10495 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10496 #else /* !XNU_MONITOR */
10497 states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10498 #endif /* XNU_MONITOR */
10499 }
10500
10501 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10502 return true;
10503 }
10504
10505 /**
10506 * Flushes TLB entries associated with the page specified by paddr, but do not
10507 * issue barriers yet.
10508 *
10509 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10510 */
10511 MARK_AS_PMAP_TEXT static void
10512 pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
10513 {
10514 #if __ARM_PTE_PHYSMAP__
10515 /* Flush the physical aperture mappings. */
10516 const vm_offset_t kva = phystokv(paddr);
10517 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10518 #endif /* __ARM_PTE_PHYSMAP__ */
10519
10520 /* Flush the mappings tracked in the ptes. */
10521 const unsigned int pai = pa_index(paddr);
10522 pv_entry_t **pv_h = pai_to_pvh(pai);
10523
10524 pt_entry_t *pte_p = PT_ENTRY_NULL;
10525 pv_entry_t *pve_p = PV_ENTRY_NULL;
10526
10527 pvh_assert_locked(pai);
10528
10529 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
10530 pte_p = pvh_ptep(pv_h);
10531 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
10532 pve_p = pvh_pve_list(pv_h);
10533 pte_p = PT_ENTRY_NULL;
10534 }
10535
10536 int pve_ptep_idx = 0;
10537 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
10538 if (pve_p != PV_ENTRY_NULL) {
10539 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
10540 if (pte_p == PT_ENTRY_NULL) {
10541 goto flush_tlb_skip_pte;
10542 }
10543 }
10544
10545 #ifdef PVH_FLAG_IOMMU
10546 if (pvh_ptep_is_iommu(pte_p)) {
10547 goto flush_tlb_skip_pte;
10548 }
10549 #endif /* PVH_FLAG_IOMMU */
10550 pmap_t pmap = ptep_get_pmap(pte_p);
10551 vm_map_address_t va = ptep_get_va(pte_p);
10552
10553 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
10554 pmap, true, false);
10555
10556 flush_tlb_skip_pte:
10557 pte_p = PT_ENTRY_NULL;
10558 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
10559 pve_ptep_idx = 0;
10560 pve_p = pve_next(pve_p);
10561 }
10562 }
10563 }
10564
10565 /**
10566 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10567 *
10568 * @param pai The Physical Address Index of the entry.
10569 * @param cacheattr The new cache attribute.
10570 */
10571 MARK_AS_PMAP_TEXT static void
10572 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10573 {
10574 pvh_assert_locked(pai);
10575
10576 pp_attr_t pp_attr_current, pp_attr_template;
10577 do {
10578 pp_attr_current = pp_attr_table[pai];
10579 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10580
10581 /**
10582 * WIMG bits should only be updated under the PVH lock, but we should do
10583 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10584 */
10585 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10586 }
10587
10588 /**
10589 * Batch updates the cache attributes of a list of pages in three passes.
10590 *
10591 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10592 * In pass two, TLB entries are flushed for each page in the list if necessary.
10593 * In pass three, caches are cleaned for each page in the list if necessary.
10594 *
10595 * When running in PPL, this function may decide to return to the caller in response
10596 * to AST_URGENT.
10597 *
10598 * @param user_page_list List of pages to be updated.
10599 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10600 * @param page_cnt Number of pages in total in user_page_list.
10601 * @param cacheattr The new cache attributes.
10602 *
10603 * @return The new state of the state machine.
10604 */
10605 MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
10606 pmap_batch_set_cache_attributes_internal(
10607 #if XNU_MONITOR
10608 volatile upl_page_info_t *user_page_list,
10609 #else /* !XNU_MONITOR */
10610 upl_page_info_array_t user_page_list,
10611 #endif /* XNU_MONITOR */
10612 batch_set_cache_attr_state_t states,
10613 unsigned int page_cnt,
10614 unsigned int cacheattr)
10615 {
10616 uint64_t page_index = states.page_index;
10617 uint64_t state = states.state;
10618 bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
10619 bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);
10620
10621 /* For verifying progress. */
10622 __assert_only const uint64_t page_index_old = page_index;
10623 __assert_only const uint64_t state_old = state;
10624
10625 /* Assert page_index and state are within their range. */
10626 if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
10627 panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
10628 }
10629
10630 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
10631 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
10632 /* Update cache attributes of the pages until there's an urgent AST or it's done. */
10633 while (page_index < page_cnt) {
10634 const ppnum_t pn = user_page_list[page_index].phys_addr;
10635 const pmap_paddr_t paddr = ptoa(pn);
10636
10637 if (!pa_valid(paddr)) {
10638 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10639 }
10640
10641 const unsigned int pai = pa_index(paddr);
10642
10643 /* Lock the page. */
10644 pvh_lock(pai);
10645
10646 #if XNU_MONITOR
10647 if (ppattr_pa_test_monitor(paddr)) {
10648 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10649 }
10650 #endif /* XNU_MONITOR */
10651 const pp_attr_t pp_attr_current = pp_attr_table[pai];
10652
10653 unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
10654 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10655 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10656 }
10657
10658 const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10659
10660 unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
10661 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10662 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10663 }
10664
10665 /* Update the cache attributes in PTE and PP_ATTR table. */
10666 if (wimg_bits_new != wimg_bits_prev) {
10667 tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
10668 pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
10669 }
10670
10671 if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
10672 rt_cache_flush_pass_needed = true;
10673 }
10674
10675 pvh_unlock(pai);
10676
10677 page_index++;
10678
10679 #if XNU_MONITOR
10680 /**
10681 * Check for AST_URGENT every page, as the pve list search in cache
10682 * update can take non-constant time.
10683 */
10684 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10685 goto pbscai_exit;
10686 }
10687 #endif /* XNU_MONITOR */
10688 }
10689
10690 /* page_index == page_cnt && !pmap_pending_preemption() */
10691 if (tlb_flush_pass_needed) {
10692 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
10693 } else if (rt_cache_flush_pass_needed) {
10694 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10695 } else {
10696 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10697 }
10698 page_index = 0;
10699
10700 /* Sync the PTE writes before potential TLB/Cache flushes. */
10701 FLUSH_PTE_STRONG();
10702
10703 #if XNU_MONITOR
10704 if (__improbable(pmap_pending_preemption())) {
10705 goto pbscai_exit;
10706 }
10707 #endif /* XNU_MONITOR */
10708 }
10709
10710 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
10711 /**
10712 * Pass 2: for each physical page and for each mapping, we need to flush
10713 * the TLB for it.
10714 */
10715 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
10716 while (page_index < page_cnt) {
10717 const ppnum_t pn = user_page_list[page_index].phys_addr;
10718
10719 const pmap_paddr_t paddr = ptoa(pn);
10720 if (!pa_valid(paddr)) {
10721 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10722 }
10723
10724 const unsigned int pai = pa_index(paddr);
10725
10726 pvh_lock(pai);
10727 pmap_flush_tlb_for_paddr_locked_async(paddr);
10728 pvh_unlock(pai);
10729
10730 page_index++;
10731
10732 #if XNU_MONITOR
10733 /**
10734 * Check for AST_URGENT every page, as the pve list search in cache
10735 * update can take non-constant time.
10736 */
10737 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10738 goto pbscai_exit;
10739 }
10740 #endif /* XNU_MONITOR */
10741 }
10742
10743 #if HAS_FEAT_XS
10744 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
10745 arm64_sync_tlb(false);
10746 #else
10747 /**
10748 * For targets that distinguish between mild and strong DSB, mild DSB
10749 * will not drain the prefetcher. This can lead to prefetch-driven
10750 * cache fills that defeat the uncacheable requirement of the RT memory type.
10751 * In those cases, strong DSB must instead be employed to drain the prefetcher.
10752 */
10753 arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
10754 #endif
10755
10756 if (rt_cache_flush_pass_needed) {
10757 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
10758 } else {
10759 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10760 }
10761 page_index = 0;
10762
10763 #if XNU_MONITOR
10764 if (__improbable(pmap_pending_preemption())) {
10765 goto pbscai_exit;
10766 }
10767 #endif /* XNU_MONITOR */
10768 }
10769
10770 if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
10771 /* Pass 3: Flush the cache if the page is recently set to RT */
10772 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
10773 #if !XNU_MONITOR
10774 /**
10775 * On non-PPL platforms, we disable preemption to ensure we are not preempted
10776 * in the state where DC by VA instructions remain enabled.
10777 */
10778 disable_preemption();
10779 #endif /* !XNU_MONITOR */
10780
10781 assert(get_preemption_level() > 0);
10782
10783 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10784 /**
10785 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
10786 * and the host will handle cache maintenance for it. So we don't need to
10787 * worry about enabling the ops here for AVP.
10788 */
10789 enable_dc_mva_ops();
10790 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10791
10792 while (page_index < page_cnt) {
10793 const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);
10794
10795 if (!pa_valid(paddr)) {
10796 panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
10797 }
10798
10799 CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);
10800
10801 page_index++;
10802
10803 #if XNU_MONITOR
10804 if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
10805 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10806 disable_dc_mva_ops();
10807 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10808 goto pbscai_exit;
10809 }
10810 #endif /* XNU_MONITOR */
10811 }
10812
10813 #if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
10814 disable_dc_mva_ops();
10815 #endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
10816
10817 #if !XNU_MONITOR
10818 enable_preemption();
10819 #endif /* !XNU_MONITOR */
10820
10821 state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
10822 page_index = 0;
10823 }
10824
10825 #if XNU_MONITOR
10826 pbscai_exit:
10827 #endif /* XNU_MONITOR */
10828 /* Assert page_index and state are within their range. */
10829 assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);
10830
10831 /* Make sure we are making progress in this call. */
10832 assert(page_index > page_index_old || state > state_old);
10833
10834 batch_set_cache_attr_state_t states_new;
10835 states_new.page_index = page_index;
10836 states_new.state = state;
10837 states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
10838 states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
10839 return states_new;
10840 }
10841
10842 MARK_AS_PMAP_TEXT static void
10843 pmap_set_cache_attributes_priv(
10844 ppnum_t pn,
10845 unsigned int cacheattr,
10846 boolean_t external __unused)
10847 {
10848 pmap_paddr_t paddr;
10849 unsigned int pai;
10850 pp_attr_t pp_attr_current;
10851 pp_attr_t pp_attr_template;
10852 unsigned int wimg_bits_prev, wimg_bits_new;
10853
10854 paddr = ptoa(pn);
10855
10856 if (!pa_valid(paddr)) {
10857 return; /* Not a managed page. */
10858 }
10859
10860 if (cacheattr & VM_WIMG_USE_DEFAULT) {
10861 cacheattr = VM_WIMG_DEFAULT;
10862 }
10863
10864 pai = pa_index(paddr);
10865
10866 pvh_lock(pai);
10867
10868 #if XNU_MONITOR
10869 if (external && ppattr_pa_test_monitor(paddr)) {
10870 panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
10871 } else if (!external && !ppattr_pa_test_monitor(paddr)) {
10872 panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
10873 }
10874 #endif
10875
10876 do {
10877 pp_attr_current = pp_attr_table[pai];
10878 wimg_bits_prev = VM_WIMG_DEFAULT;
10879 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10880 wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
10881 }
10882
10883 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));
10884
10885 /**
10886 * WIMG bits should only be updated under the PVH lock, but we should do
10887 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10888 */
10889 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10890
10891 wimg_bits_new = VM_WIMG_DEFAULT;
10892 if (pp_attr_template & PP_ATTR_WIMG_MASK) {
10893 wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
10894 }
10895
10896 if (wimg_bits_new != wimg_bits_prev) {
10897 pmap_update_cache_attributes_locked(pn, cacheattr, true);
10898 }
10899
10900 pvh_unlock(pai);
10901
10902 pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
10903 }
10904
10905 MARK_AS_PMAP_TEXT void
10906 pmap_set_cache_attributes_internal(
10907 ppnum_t pn,
10908 unsigned int cacheattr)
10909 {
10910 pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
10911 }
10912
10913 void
10914 pmap_set_cache_attributes(
10915 ppnum_t pn,
10916 unsigned int cacheattr)
10917 {
10918 #if XNU_MONITOR
10919 pmap_set_cache_attributes_ppl(pn, cacheattr);
10920 #else
10921 pmap_set_cache_attributes_internal(pn, cacheattr);
10922 #endif
10923 }
10924
10925 /**
10926 * Updates the page numbered ppnum to have attribute specified by attributes.
10927 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10928 * The necessity of the TLB flush is returned in case this function is called
10929 * in a batched manner and the TLB flush is intended to be done at a different
10930 * timing.
10931 *
10932 * @param ppnum Page Number of the page to be updated.
10933 * @param attributes The new cache attributes.
10934 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
10935 * immediately.
10936 *
10937 * @return Returns true if a TLB flush is needed for this update regardless of
10938 * whether a flush has occurred already.
10939 */
10940 MARK_AS_PMAP_TEXT bool
10941 pmap_update_cache_attributes_locked(
10942 ppnum_t ppnum,
10943 unsigned attributes,
10944 bool perform_tlbi)
10945 {
10946 pmap_paddr_t phys = ptoa(ppnum);
10947 pv_entry_t *pve_p;
10948 pt_entry_t *pte_p;
10949 pv_entry_t **pv_h;
10950 pt_entry_t tmplate;
10951 unsigned int pai;
10952 boolean_t tlb_flush_needed = false;
10953
10954 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);
10955
10956 if (pmap_panic_dev_wimg_on_managed) {
10957 switch (attributes & VM_WIMG_MASK) {
10958 case VM_WIMG_IO: // nGnRnE
10959 case VM_WIMG_POSTED: // nGnRE
10960 /* supported on DRAM, but slow, so we disallow */
10961
10962 case VM_WIMG_POSTED_REORDERED: // nGRE
10963 case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
10964 /* unsupported on DRAM */
10965
10966 panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
10967 __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
10968 break;
10969
10970 default:
10971 /* not device type memory, all good */
10972
10973 break;
10974 }
10975 }
10976
10977 #if __ARM_PTE_PHYSMAP__
10978 vm_offset_t kva = phystokv(phys);
10979 pte_p = pmap_pte(kernel_pmap, kva);
10980
10981 tmplate = *pte_p;
10982 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
10983 #if XNU_MONITOR
10984 tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
10985 #else
10986 tmplate |= wimg_to_pte(attributes, phys);
10987 #endif
10988 if (tmplate & ARM_PTE_HINT_MASK) {
10989 panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
10990 __FUNCTION__, pte_p, (void *)kva, tmplate);
10991 }
10992
10993 if (perform_tlbi) {
10994 write_pte_strong(pte_p, tmplate);
10995 flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
10996 } else {
10997 write_pte_fast(pte_p, tmplate);
10998 }
10999 tlb_flush_needed = true;
11000 #endif
11001
11002 pai = pa_index(phys);
11003
11004 pv_h = pai_to_pvh(pai);
11005
11006 pte_p = PT_ENTRY_NULL;
11007 pve_p = PV_ENTRY_NULL;
11008 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
11009 pte_p = pvh_ptep(pv_h);
11010 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
11011 pve_p = pvh_pve_list(pv_h);
11012 pte_p = PT_ENTRY_NULL;
11013 }
11014
11015 int pve_ptep_idx = 0;
11016 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
11017 vm_map_address_t va;
11018 pmap_t pmap;
11019
11020 if (pve_p != PV_ENTRY_NULL) {
11021 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
11022 if (pte_p == PT_ENTRY_NULL) {
11023 goto cache_skip_pve;
11024 }
11025 }
11026
11027 #ifdef PVH_FLAG_IOMMU
11028 if (pvh_ptep_is_iommu(pte_p)) {
11029 goto cache_skip_pve;
11030 }
11031 #endif
11032 pmap = ptep_get_pmap(pte_p);
11033 #if HAS_FEAT_XS
11034 /**
11035 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
11036 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
11037 */
11038 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
11039 #endif /* HAS_FEAT_XS */
11040 va = ptep_get_va(pte_p);
11041
11042 tmplate = *pte_p;
11043 tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
11044 tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);
11045
11046 if (perform_tlbi) {
11047 write_pte_strong(pte_p, tmplate);
11048 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
11049 pmap, true, false);
11050 } else {
11051 write_pte_fast(pte_p, tmplate);
11052 }
11053 tlb_flush_needed = true;
11054
11055 cache_skip_pve:
11056 pte_p = PT_ENTRY_NULL;
11057 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
11058 pve_ptep_idx = 0;
11059 pve_p = pve_next(pve_p);
11060 }
11061 }
11062 if (perform_tlbi && tlb_flush_needed) {
11063 #if HAS_FEAT_XS
11064 /* With FEAT_XS, ordinary DSBs drain the prefetcher. */
11065 arm64_sync_tlb(false);
11066 #else
11067 /**
11068 * For targets that distinguish between mild and strong DSB, mild DSB
11069 * will not drain the prefetcher. This can lead to prefetch-driven
11070 * cache fills that defeat the uncacheable requirement of the RT memory type.
11071 * In those cases, strong DSB must instead be employed to drain the prefetcher.
11072 */
11073 arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
11074 #endif
11075 }
11076
11077 PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);
11078
11079 return tlb_flush_needed;
11080 }
11081
11082 /**
11083 * Mark a pmap as being dedicated to use for a commpage mapping.
11084 * The pmap itself will never be activated on a CPU; its mappings will
11085 * only be embedded in userspace pmaps at a fixed virtual address.
11086 *
11087 * @param pmap the pmap to mark as belonging to a commpage.
11088 */
11089 static void
11090 pmap_set_commpage(pmap_t pmap)
11091 {
11092 #if XNU_MONITOR
11093 assert(!pmap_ppl_locked_down);
11094 #endif
11095 assert(pmap->type == PMAP_TYPE_USER);
11096 pmap->type = PMAP_TYPE_COMMPAGE;
11097 /*
11098 * Free the pmap's ASID. This pmap should not ever be directly
11099 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
11100 * ASID space contention but will also cause pmap_switch() to panic
11101 * if an attacker tries to activate this pmap. Disable preemption to
11102 * accommodate the *_nopreempt spinlock in free_asid().
11103 */
11104 mp_disable_preemption();
11105 pmap_get_pt_ops(pmap)->free_id(pmap);
11106 mp_enable_preemption();
11107 }
11108
11109 static void
11110 pmap_update_tt3e(
11111 pmap_t pmap,
11112 vm_address_t address,
11113 tt_entry_t template)
11114 {
11115 tt_entry_t *ptep, pte;
11116
11117 ptep = pmap_tt3e(pmap, address);
11118 if (ptep == NULL) {
11119 panic("%s: no ptep?", __FUNCTION__);
11120 }
11121
11122 pte = *ptep;
11123 pte = tte_to_pa(pte) | template;
11124 write_pte_strong(ptep, pte);
11125 }
11126
/*
 * PTE template for the commpage data mappings: valid, write-back cacheable,
 * inner shareable, AP_RORO access permissions, access flag set, and marked
 * no-execute through both ARM_PTE_NX and ARM_PTE_PNX.
 * Note absence of non-global bit.
 */
#define PMAP_COMM_PAGE_PTE_TEMPLATE \
	(ARM_PTE_TYPE_VALID | \
	ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | \
	ARM_PTE_SH(SH_INNER_MEMORY) | \
	ARM_PTE_NX | \
	ARM_PTE_PNX | \
	ARM_PTE_AP(AP_RORO) | \
	ARM_PTE_AF)

/*
 * PTE template for the commpage text mapping: same as the data template
 * except that ARM_PTE_NX is not set (ARM_PTE_PNX still is).
 * Note absence of non-global bit and no-execute bit.
 */
#define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE \
	(ARM_PTE_TYPE_VALID | \
	ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) | \
	ARM_PTE_SH(SH_INNER_MEMORY) | \
	ARM_PTE_PNX | \
	ARM_PTE_AP(AP_RORO) | \
	ARM_PTE_AF)
11138
11139 void
11140 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
11141 vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
11142 {
11143 kern_return_t kr;
11144 pmap_paddr_t data_pa = 0; // data address
11145 pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
11146 pmap_paddr_t text_pa = 0; // text address
11147
11148 *kernel_data_addr = 0;
11149 *kernel_text_addr = 0;
11150 *user_text_addr = 0;
11151
11152 #if XNU_MONITOR
11153 data_pa = pmap_alloc_page_for_kern(0);
11154 assert(data_pa);
11155 memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
11156 ro_data_pa = pmap_alloc_page_for_kern(0);
11157 assert(ro_data_pa);
11158 memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
11159 #if CONFIG_ARM_PFZ
11160 text_pa = pmap_alloc_page_for_kern(0);
11161 assert(text_pa);
11162 memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
11163 #endif
11164
11165 #else /* XNU_MONITOR */
11166 (void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
11167 /*
11168 * For non-PPL devices, we have neither page lockdown nor a physical aperture
11169 * mapped at page granularity, so a separate page for kernel RO data would not
11170 * be useful.
11171 */
11172 ro_data_pa = data_pa;
11173 #if CONFIG_ARM_PFZ
11174 (void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
11175 #endif
11176
11177 #endif /* XNU_MONITOR */
11178
11179 /*
11180 * In order to avoid burning extra pages on mapping the shared page, we
11181 * create a dedicated pmap for the shared page. We forcibly nest the
11182 * translation tables from this pmap into other pmaps. The level we
11183 * will nest at depends on the MMU configuration (page size, TTBR range,
11184 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
11185 *
11186 * Note that this is NOT "the nested pmap" (which is used to nest the
11187 * shared cache).
11188 *
11189 * Note that we update parameters of the entry for our unique needs (NG
11190 * entry, etc.).
11191 */
11192 commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
11193 assert(commpage_pmap_default != NULL);
11194 pmap_set_commpage(commpage_pmap_default);
11195
11196 /* The user 64-bit mappings... */
11197 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11198 assert(kr == KERN_SUCCESS);
11199 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11200
11201 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11202 assert(kr == KERN_SUCCESS);
11203 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11204 #if CONFIG_ARM_PFZ
11205 /* User mapping of comm page text section for 64 bit mapping only
11206 *
11207 * We don't insert it into the 32 bit mapping because we don't want 32 bit
11208 * user processes to get this page mapped in, they should never call into
11209 * this page.
11210 *
11211 * The data comm page is in a pre-reserved L3 VA range and the text commpage
11212 * is slid in the same L3 as the data commpage. It is either outside the
11213 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
11214 * it is reserved and unavailable to mach VM for future mappings.
11215 */
11216 const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
11217 int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
11218
11219 vm_map_address_t commpage_text_va = 0;
11220
11221 do {
11222 int text_leaf_index = random() % num_ptes;
11223
11224 // Generate a VA for the commpage text with the same root and twig index as data
11225 // comm page, but with new leaf index we've just generated.
11226 commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
11227 commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
11228 } while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
11229
11230 // Assert that this is empty
11231 __assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
11232 assert(ptep != PT_ENTRY_NULL);
11233 assert(*ptep == ARM_TTE_EMPTY);
11234
11235 // At this point, we've found the address we want to insert our comm page at
11236 kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11237 assert(kr == KERN_SUCCESS);
11238 // Mark it as global page R/X so that it doesn't get thrown out on tlb flush
11239 pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
11240
11241 *user_text_addr = commpage_text_va;
11242 #endif
11243
11244 /* ...and the user 32-bit mappings. */
11245 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11246 assert(kr == KERN_SUCCESS);
11247 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11248
11249 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11250 assert(kr == KERN_SUCCESS);
11251 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11252 #if __ARM_MIXED_PAGE_SIZE__
11253 /**
11254 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
11255 * new set of page tables that point to the exact same 16K shared page as
11256 * before. Only the first 4K of the 16K shared page is mapped since that's
11257 * the only part that contains relevant data.
11258 */
11259 commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
11260 assert(commpage_pmap_4k != NULL);
11261 pmap_set_commpage(commpage_pmap_4k);
11262
11263 /* The user 64-bit mappings... */
11264 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11265 assert(kr == KERN_SUCCESS);
11266 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11267
11268 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11269 assert(kr == KERN_SUCCESS);
11270 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11271
11272 /* ...and the user 32-bit mapping. */
11273 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11274 assert(kr == KERN_SUCCESS);
11275 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11276
11277 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11278 assert(kr == KERN_SUCCESS);
11279 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11280 #endif
11281
11282 /* For manipulation in kernel, go straight to physical page */
11283 *kernel_data_addr = phystokv(data_pa);
11284 assert(commpage_ro_data_kva == 0);
11285 *kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
11286 assert(commpage_text_kva == 0);
11287 *kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
11288 }
11289
11290
11291 /*
11292 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11293 * with user controlled TTEs for regions that aren't explicitly reserved by the
11294 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11295 */
11296 #if (ARM_PGSHIFT == 14)
11297 /**
11298 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11299 * commpage completely above the maximum 32-bit userspace VA.
11300 */
11301 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11302
11303 /**
11304 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11305 * userspace VAs can nest the commpage completely above the maximum 64-bit
11306 * userpace VA, but that technically isn't true on macOS. On those systems, the
11307 * commpage lives within the userspace VA range, but is protected by the VM as
11308 * a reserved region (see vm_reserved_regions[] definition for more info).
11309 */
11310
11311 #elif (ARM_PGSHIFT == 12)
11312 /**
11313 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11314 * above the maximum userspace VA.
11315 */
11316 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11317 #else
11318 #error Nested shared page mapping is unsupported on this config
11319 #endif
11320
11321 MARK_AS_PMAP_TEXT kern_return_t
11322 pmap_insert_commpage_internal(
11323 pmap_t pmap)
11324 {
11325 kern_return_t kr = KERN_SUCCESS;
11326 vm_offset_t commpage_vaddr;
11327 pt_entry_t *ttep, *src_ttep;
11328 int options = 0;
11329 pmap_t commpage_pmap = commpage_pmap_default;
11330
11331 /* Validate the pmap input before accessing its data. */
11332 validate_pmap_mutable(pmap);
11333
11334 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11335 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
11336
11337 #if __ARM_MIXED_PAGE_SIZE__
11338 #if !__ARM_16K_PG__
11339 /* The following code assumes that commpage_pmap_default is a 16KB pmap. */
11340 #error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11341 #endif /* !__ARM_16K_PG__ */
11342
11343 /* Choose the correct shared page pmap to use. */
11344 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11345 if (pmap_page_size == 16384) {
11346 commpage_pmap = commpage_pmap_default;
11347 } else if (pmap_page_size == 4096) {
11348 commpage_pmap = commpage_pmap_4k;
11349 } else {
11350 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11351 }
11352 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11353
11354 #if XNU_MONITOR
11355 options |= PMAP_OPTIONS_NOWAIT;
11356 #endif /* XNU_MONITOR */
11357
11358 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11359 #error We assume a single page.
11360 #endif
11361
11362 if (pmap_is_64bit(pmap)) {
11363 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11364 } else {
11365 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11366 }
11367
11368
11369 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11370
11371 /*
11372 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
11373 * two (2MB) depending on the address space layout. For 16KB pages, each level
11374 * one entry is 64GB, so we must go to the second level entry (32MB) in order
11375 * to "nest".
11376 *
11377 * Note: This is not "nesting" in the shared cache sense. This definition of
11378 * nesting just means inserting pointers to pre-allocated tables inside of
11379 * the passed in pmap to allow us to share page tables (which map the shared
11380 * page) for every task. This saves at least one page of memory per process
11381 * compared to creating new page tables in every process for mapping the
11382 * shared page.
11383 */
11384
11385 /**
11386 * Allocate the twig page tables if needed, and slam a pointer to the shared
11387 * page's tables into place.
11388 */
11389 while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
11390 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11391
11392 kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);
11393
11394 if (kr != KERN_SUCCESS) {
11395 #if XNU_MONITOR
11396 if (kr == KERN_RESOURCE_SHORTAGE) {
11397 return kr;
11398 } else
11399 #endif
11400 if (kr == KERN_ABORTED) {
11401 return kr;
11402 } else {
11403 panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
11404 }
11405 }
11406
11407 pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
11408 }
11409
11410 if (*ttep != ARM_PTE_EMPTY) {
11411 panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
11412 }
11413
11414 src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);
11415
11416 *ttep = *src_ttep;
11417 FLUSH_PTE_STRONG();
11418
11419 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
11420
11421 return kr;
11422 }
11423
11424 static void
11425 pmap_unmap_commpage(
11426 pmap_t pmap)
11427 {
11428 pt_entry_t *ttep;
11429 vm_offset_t commpage_vaddr;
11430 pmap_t commpage_pmap = commpage_pmap_default;
11431
11432 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11433 const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);
11434
11435 #if __ARM_MIXED_PAGE_SIZE__
11436 #if !__ARM_16K_PG__
11437 /* The following code assumes that commpage_pmap_default is a 16KB pmap. */
11438 #error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
11439 #endif /* !__ARM_16K_PG__ */
11440
11441 /* Choose the correct shared page pmap to use. */
11442 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
11443 if (pmap_page_size == 16384) {
11444 commpage_pmap = commpage_pmap_default;
11445 } else if (pmap_page_size == 4096) {
11446 commpage_pmap = commpage_pmap_4k;
11447 } else {
11448 panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
11449 }
11450 #endif /* __ARM_MIXED_PAGE_SIZE__ */
11451
11452 #if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
11453 #error We assume a single page.
11454 #endif
11455
11456 if (pmap_is_64bit(pmap)) {
11457 commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
11458 } else {
11459 commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
11460 }
11461
11462
11463 ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);
11464
11465 if (ttep == NULL) {
11466 return;
11467 }
11468
11469 /* It had better be mapped to the shared page. */
11470 if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
11471 panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
11472 }
11473
11474 *ttep = ARM_TTE_EMPTY;
11475 FLUSH_PTE_STRONG();
11476
11477 flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
11478 sync_tlb_flush();
11479 }
11480
11481 void
11482 pmap_insert_commpage(
11483 pmap_t pmap)
11484 {
11485 kern_return_t kr = KERN_FAILURE;
11486 #if XNU_MONITOR
11487 do {
11488 kr = pmap_insert_commpage_ppl(pmap);
11489
11490 if (kr == KERN_RESOURCE_SHORTAGE) {
11491 pmap_alloc_page_for_ppl(0);
11492 }
11493 } while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);
11494
11495 pmap_ledger_check_balance(pmap);
11496 #else
11497 do {
11498 kr = pmap_insert_commpage_internal(pmap);
11499 } while (kr == KERN_ABORTED);
11500 #endif
11501
11502 if (kr != KERN_SUCCESS) {
11503 panic("%s: failed to insert the shared page, kr=%d, "
11504 "pmap=%p",
11505 __FUNCTION__, kr,
11506 pmap);
11507 }
11508 }
11509
11510 static boolean_t
11511 pmap_is_64bit(
11512 pmap_t pmap)
11513 {
11514 return pmap->is_64bit;
11515 }
11516
11517 bool
11518 pmap_is_exotic(
11519 pmap_t pmap __unused)
11520 {
11521 return false;
11522 }
11523
11524
11525 /* ARMTODO -- an implementation that accounts for
11526 * holes in the physical map, if any.
11527 */
11528 boolean_t
11529 pmap_valid_page(
11530 ppnum_t pn)
11531 {
11532 return pa_valid(ptoa(pn));
11533 }
11534
11535 boolean_t
11536 pmap_bootloader_page(
11537 ppnum_t pn)
11538 {
11539 pmap_paddr_t paddr = ptoa(pn);
11540
11541 if (pa_valid(paddr)) {
11542 return FALSE;
11543 }
11544 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11545 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11546 }
11547
11548 MARK_AS_PMAP_TEXT boolean_t
11549 pmap_is_empty_internal(
11550 pmap_t pmap,
11551 vm_map_offset_t va_start,
11552 vm_map_offset_t va_end)
11553 {
11554 vm_map_offset_t block_start, block_end;
11555 tt_entry_t *tte_p;
11556
11557 if (pmap == NULL) {
11558 return TRUE;
11559 }
11560
11561 validate_pmap(pmap);
11562
11563 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
11564 unsigned int initial_not_in_kdp = not_in_kdp;
11565
11566 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11567 pmap_lock(pmap, PMAP_LOCK_SHARED);
11568 }
11569
11570
11571 /* TODO: This will be faster if we increment ttep at each level. */
11572 block_start = va_start;
11573
11574 while (block_start < va_end) {
11575 pt_entry_t *bpte_p, *epte_p;
11576 pt_entry_t *pte_p;
11577
11578 block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
11579 if (block_end > va_end) {
11580 block_end = va_end;
11581 }
11582
11583 tte_p = pmap_tte(pmap, block_start);
11584 if ((tte_p != PT_ENTRY_NULL)
11585 && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
11586 pte_p = (pt_entry_t *) ttetokv(*tte_p);
11587 bpte_p = &pte_p[pte_index(pt_attr, block_start)];
11588 epte_p = &pte_p[pte_index(pt_attr, block_end)];
11589
11590 for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
11591 if (*pte_p != ARM_PTE_EMPTY) {
11592 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11593 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11594 }
11595 return FALSE;
11596 }
11597 }
11598 }
11599 block_start = block_end;
11600 }
11601
11602 if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
11603 pmap_unlock(pmap, PMAP_LOCK_SHARED);
11604 }
11605
11606 return TRUE;
11607 }
11608
11609 boolean_t
11610 pmap_is_empty(
11611 pmap_t pmap,
11612 vm_map_offset_t va_start,
11613 vm_map_offset_t va_end)
11614 {
11615 #if XNU_MONITOR
11616 return pmap_is_empty_ppl(pmap, va_start, va_end);
11617 #else
11618 return pmap_is_empty_internal(pmap, va_start, va_end);
11619 #endif
11620 }
11621
11622 vm_map_offset_t
11623 pmap_max_offset(
11624 boolean_t is64,
11625 unsigned int option)
11626 {
11627 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11628 }
11629
11630 vm_map_offset_t
11631 pmap_max_64bit_offset(
11632 __unused unsigned int option)
11633 {
11634 vm_map_offset_t max_offset_ret = 0;
11635
11636 #if defined(__arm64__)
11637 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11638 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11639 max_offset_ret = arm64_pmap_max_offset_default;
11640 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11641 max_offset_ret = min_max_offset;
11642 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11643 max_offset_ret = MACH_VM_MAX_ADDRESS;
11644 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11645 if (arm64_pmap_max_offset_default) {
11646 max_offset_ret = arm64_pmap_max_offset_default;
11647 } else if (max_mem > 0xC0000000) {
11648 // devices with > 3GB of memory
11649 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11650 } else if (max_mem > 0x40000000) {
11651 // devices with > 1GB and <= 3GB of memory
11652 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11653 } else {
11654 // devices with <= 1 GB of memory
11655 max_offset_ret = min_max_offset;
11656 }
11657 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11658 if (arm64_pmap_max_offset_default) {
11659 // Allow the boot-arg to override jumbo size
11660 max_offset_ret = arm64_pmap_max_offset_default;
11661 } else {
11662 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11663 }
11664 } else {
11665 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11666 }
11667
11668 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11669 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11670 assert(max_offset_ret >= min_max_offset);
11671 }
11672 #else
11673 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11674 #endif
11675
11676 return max_offset_ret;
11677 }
11678
11679 vm_map_offset_t
11680 pmap_max_32bit_offset(
11681 unsigned int option)
11682 {
11683 vm_map_offset_t max_offset_ret = 0;
11684
11685 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11686 max_offset_ret = arm_pmap_max_offset_default;
11687 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11688 max_offset_ret = VM_MAX_ADDRESS;
11689 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11690 max_offset_ret = VM_MAX_ADDRESS;
11691 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11692 if (arm_pmap_max_offset_default) {
11693 max_offset_ret = arm_pmap_max_offset_default;
11694 } else if (max_mem > 0x20000000) {
11695 max_offset_ret = VM_MAX_ADDRESS;
11696 } else {
11697 max_offset_ret = VM_MAX_ADDRESS;
11698 }
11699 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11700 max_offset_ret = VM_MAX_ADDRESS;
11701 } else {
11702 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11703 }
11704
11705 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11706 return max_offset_ret;
11707 }
11708
11709 #if CONFIG_DTRACE
11710 /*
11711 * Constrain DTrace copyin/copyout actions
11712 */
11713 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11714 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11715
11716 kern_return_t
11717 dtrace_copyio_preflight(
11718 __unused addr64_t va)
11719 {
11720 if (current_map() == kernel_map) {
11721 return KERN_FAILURE;
11722 } else {
11723 return KERN_SUCCESS;
11724 }
11725 }
11726
11727 kern_return_t
11728 dtrace_copyio_postflight(
11729 __unused addr64_t va)
11730 {
11731 return KERN_SUCCESS;
11732 }
11733 #endif /* CONFIG_DTRACE */
11734
11735
11736 void
11737 pmap_flush_context_init(__unused pmap_flush_context *pfc)
11738 {
11739 }
11740
11741
11742 void
11743 pmap_flush(
11744 __unused pmap_flush_context *cpus_to_flush)
11745 {
11746 /* not implemented yet */
11747 return;
11748 }
11749
11750 #if XNU_MONITOR
11751
11752 /*
11753 * Enforce that the address range described by kva and nbytes is not currently
11754 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11755 * unintentionally writing to PPL-owned memory.
11756 */
11757 void
11758 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11759 {
11760 vm_offset_t end;
11761 if (os_add_overflow(kva, nbytes, &end)) {
11762 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11763 }
11764 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11765 pmap_paddr_t pa = kvtophys_nofail(ckva);
11766 pp_attr_t attr;
11767 unsigned int pai = pa_index(pa);
11768 if (ckva == phystokv(pa)) {
11769 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11770 }
11771 do {
11772 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11773 if (attr & PP_ATTR_MONITOR) {
11774 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11775 }
11776 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11777 }
11778 }
11779
11780 void
11781 pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
11782 {
11783 vm_offset_t end;
11784 if (os_add_overflow(kva, nbytes, &end)) {
11785 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11786 }
11787 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11788 pmap_paddr_t pa = kvtophys_nofail(ckva);
11789
11790 if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
11791 panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
11792 }
11793 assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
11794 ppattr_pa_clear_no_monitor(pa);
11795 }
11796 }
11797
11798 /**
11799 * Lock down a page, making all mappings read-only, and preventing further
11800 * mappings or removal of this particular kva's mapping. Effectively, it makes
11801 * the physical page at kva immutable (see the ppl_writable parameter for an
11802 * exception to this).
11803 *
11804 * @param kva Valid address to any mapping of the physical page to lockdown.
11805 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11806 * @param ppl_writable True if the PPL should still be able to write to the page
11807 * using the physical aperture mapping. False will make the
11808 * page read-only for both the kernel and PPL in the
11809 * physical aperture.
11810 */
11811
11812 MARK_AS_PMAP_TEXT static void
11813 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11814 {
11815 pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
11816 }
11817
11818 /**
11819 * Lock down a page, giving all mappings the specified maximum permissions, and
11820 * preventing further mappings or removal of this particular kva's mapping.
11821 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11822 * parameter for an exception to this).
11823 *
11824 * @param kva Valid address to any mapping of the physical page to lockdown.
11825 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11826 * @param ppl_writable True if the PPL should still be able to write to the page
11827 * using the physical aperture mapping. False will make the
11828 * page read-only for both the kernel and PPL in the
11829 * physical aperture.
11830 * @param prot Maximum permissions to allow in existing alias mappings
11831 */
11832 MARK_AS_PMAP_TEXT static void
11833 pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
11834 {
11835 const pmap_paddr_t pa = kvtophys_nofail(kva);
11836 const unsigned int pai = pa_index(pa);
11837
11838 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11839 pvh_lock(pai);
11840 pv_entry_t **pvh = pai_to_pvh(pai);
11841 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11842
11843 if (__improbable(ppattr_pa_test_monitor(pa))) {
11844 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11845 }
11846
11847 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11848 panic("%s: %#lx already locked down/executable (%#llx)",
11849 __func__, kva, (uint64_t)pvh_flags);
11850 }
11851
11852
11853 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11854
11855 /* Update the physical aperture mapping to prevent kernel write access. */
11856 const unsigned int new_xprr_perm =
11857 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11858 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11859
11860 pvh_unlock(pai);
11861
11862 pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);
11863
11864 /**
11865 * Double-check that the mapping didn't change physical addresses before the
11866 * LOCKDOWN flag was set (there is a brief window between the above
11867 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11868 *
11869 * This doesn't solve the ABA problem, but this doesn't have to since once
11870 * the pvh_lock() is grabbed no new mappings can be created on this physical
11871 * page without the LOCKDOWN flag already set (so any future mappings can
11872 * only be RO, and no existing mappings can be removed).
11873 */
11874 if (kvtophys_nofail(kva) != pa) {
11875 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11876 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11877 }
11878 }
11879
11880 /**
11881 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11882 * kernel once again.
11883 *
11884 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11885 * to unlockdown a page that was never locked down, will panic.
11886 *
11887 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11888 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11889 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11890 * passed to the paired pmap_ppl_lockdown_page() call. Any
11891 * deviation will result in a panic.
11892 */
11893 MARK_AS_PMAP_TEXT static void
11894 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11895 {
11896 pvh_assert_locked(pai);
11897 pv_entry_t **pvh = pai_to_pvh(pai);
11898 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11899
11900 if (__improbable(!(pvh_flags & lockdown_flag))) {
11901 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11902 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11903 }
11904
11905
11906 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11907
11908 /* Restore the pre-lockdown physical aperture mapping permissions. */
11909 const unsigned int old_xprr_perm =
11910 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11911 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11912 }
11913
11914 /**
11915 * Release a page from being locked down to the PPL, making it writable to the
11916 * kernel once again.
11917 *
11918 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11919 * to unlockdown a page that was never locked down, will panic.
11920 *
11921 * @param kva Valid address to any mapping of the physical page to unlockdown.
11922 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11923 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11924 * passed to the paired pmap_ppl_lockdown_page() call. Any
11925 * deviation will result in a panic.
11926 */
11927 MARK_AS_PMAP_TEXT static void
11928 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11929 {
11930 const pmap_paddr_t pa = kvtophys_nofail(kva);
11931 const unsigned int pai = pa_index(pa);
11932
11933 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11934 pvh_lock(pai);
11935 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
11936 pvh_unlock(pai);
11937 }
11938
11939 #else /* XNU_MONITOR */
11940
11941 void __unused
11942 pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11943 {
11944 }
11945
11946 void __unused
11947 pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
11948 {
11949 }
11950
11951 #endif /* !XNU_MONITOR */
11952
11953
11954 MARK_AS_PMAP_TEXT static inline void
11955 pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11956 {
11957 #if XNU_MONITOR
11958 pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11959 #else
11960 pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
11961 #endif
11962 }
11963
11964 MARK_AS_PMAP_TEXT static inline void
11965 pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
11966 {
11967 #if XNU_MONITOR
11968 pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
11969 #else
11970 pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
11971 #endif
11972 }
11973
11974 /**
11975 * Perform basic validation checks on the destination only and
11976 * corresponding offset/sizes prior to writing to a read only allocation.
11977 *
11978 * @note Should be called before writing to an allocation from the read
11979 * only allocator.
11980 *
11981 * @param zid The ID of the zone the allocation belongs to.
11982 * @param va VA of element being modified (destination).
11983 * @param offset Offset being written to, in the element.
11984 * @param new_data_size Size of modification.
11985 *
11986 */
11987
11988 MARK_AS_PMAP_TEXT static void
11989 pmap_ro_zone_validate_element_dst(
11990 zone_id_t zid,
11991 vm_offset_t va,
11992 vm_offset_t offset,
11993 vm_size_t new_data_size)
11994 {
11995 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
11996 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
11997 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
11998 }
11999
12000 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12001
12002 /* Check element is from correct zone and properly aligned */
12003 zone_require_ro(zid, elem_size, (void*)va);
12004
12005 if (__improbable(new_data_size > (elem_size - offset))) {
12006 panic("%s: New data size %lu too large for elem size %lu at addr %p",
12007 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12008 }
12009 if (__improbable(offset >= elem_size)) {
12010 panic("%s: Offset %lu too large for elem size %lu at addr %p",
12011 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12012 }
12013 }
12014
12015
12016 /**
12017 * Perform basic validation checks on the source, destination and
12018 * corresponding offset/sizes prior to writing to a read only allocation.
12019 *
12020 * @note Should be called before writing to an allocation from the read
12021 * only allocator.
12022 *
12023 * @param zid The ID of the zone the allocation belongs to.
12024 * @param va VA of element being modified (destination).
12025 * @param offset Offset being written to, in the element.
12026 * @param new_data Pointer to new data (source).
12027 * @param new_data_size Size of modification.
12028 *
12029 */
12030
12031 MARK_AS_PMAP_TEXT static void
12032 pmap_ro_zone_validate_element(
12033 zone_id_t zid,
12034 vm_offset_t va,
12035 vm_offset_t offset,
12036 const vm_offset_t new_data,
12037 vm_size_t new_data_size)
12038 {
12039 vm_offset_t sum = 0;
12040
12041 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12042 panic("%s: Integer addition overflow %p + %lu = %lu",
12043 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12044 }
12045
12046 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12047 }
12048
12049 /**
12050 * Ensure that physical page is locked down and pinned, before writing to it.
12051 *
12052 * @note Should be called before writing to an allocation from the read
12053 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12054 * ensure that it is called after the modification.
12055 *
12056 *
12057 * @param pa Physical address of the element being modified.
12058 * @param va Virtual address of element being modified.
12059 * @param size Size of the modification.
12060 *
12061 */
12062
12063 MARK_AS_PMAP_TEXT static void
12064 pmap_ro_zone_lock_phy_page(
12065 const pmap_paddr_t pa,
12066 vm_offset_t va,
12067 vm_size_t size)
12068 {
12069 const unsigned int pai = pa_index(pa);
12070 pvh_lock(pai);
12071
12072 /* Ensure that the physical page is locked down */
12073 #if XNU_MONITOR
12074 pv_entry_t **pvh = pai_to_pvh(pai);
12075 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
12076 panic("%s: Physical page not locked down %llx", __func__, pa);
12077 }
12078 #endif /* XNU_MONITOR */
12079
12080 /* Ensure page can't become PPL-owned memory before the memcpy occurs */
12081 pmap_pin_kernel_pages(va, size);
12082 }
12083
12084 /**
12085 * Unlock and unpin physical page after writing to it.
12086 *
12087 * @note Should be called after writing to an allocation from the read
12088 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12089 * ensure that it has been called prior to the modification.
12090 *
12091 * @param pa Physical address of the element that was modified.
12092 * @param va Virtual address of element that was modified.
12093 * @param size Size of the modification.
12094 *
12095 */
12096
12097 MARK_AS_PMAP_TEXT static void
12098 pmap_ro_zone_unlock_phy_page(
12099 const pmap_paddr_t pa,
12100 vm_offset_t va,
12101 vm_size_t size)
12102 {
12103 const unsigned int pai = pa_index(pa);
12104 pmap_unpin_kernel_pages(va, size);
12105 pvh_unlock(pai);
12106 }
12107
12108 /**
12109 * Function to copy kauth_cred from new_data to kv.
12110 * Function defined in "kern_prot.c"
12111 *
12112 * @note Will be removed upon completion of
12113 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12114 *
12115 * @param kv Address to copy new data to.
12116 * @param new_data Pointer to new data.
12117 *
12118 */
12119
12120 extern void
12121 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12122
12123 /**
12124 * Zalloc-specific memcpy that writes through the physical aperture
12125 * and ensures the element being modified is from a read-only zone.
12126 *
12127 * @note Designed to work only with the zone allocator's read-only submap.
12128 *
12129 * @param zid The ID of the zone to allocate from.
12130 * @param va VA of element to be modified.
12131 * @param offset Offset from element.
12132 * @param new_data Pointer to new data.
12133 * @param new_data_size Size of modification.
12134 *
12135 */
12136
12137 void
12138 pmap_ro_zone_memcpy(
12139 zone_id_t zid,
12140 vm_offset_t va,
12141 vm_offset_t offset,
12142 const vm_offset_t new_data,
12143 vm_size_t new_data_size)
12144 {
12145 #if XNU_MONITOR
12146 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
12147 #else /* XNU_MONITOR */
12148 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
12149 #endif /* XNU_MONITOR */
12150 }
12151
12152 MARK_AS_PMAP_TEXT void
12153 pmap_ro_zone_memcpy_internal(
12154 zone_id_t zid,
12155 vm_offset_t va,
12156 vm_offset_t offset,
12157 const vm_offset_t new_data,
12158 vm_size_t new_data_size)
12159 {
12160 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12161
12162 if (!new_data || new_data_size == 0) {
12163 return;
12164 }
12165
12166 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12167 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12168 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12169 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12170 }
12171
12172 /**
12173 * Zalloc-specific function to atomically mutate fields of an element that
12174 * belongs to a read-only zone, via the physcial aperture.
12175 *
12176 * @note Designed to work only with the zone allocator's read-only submap.
12177 *
12178 * @param zid The ID of the zone the element belongs to.
12179 * @param va VA of element to be modified.
12180 * @param offset Offset in element.
12181 * @param op Atomic operation to perform.
12182 * @param value Mutation value.
12183 *
12184 */
12185
12186 uint64_t
12187 pmap_ro_zone_atomic_op(
12188 zone_id_t zid,
12189 vm_offset_t va,
12190 vm_offset_t offset,
12191 zro_atomic_op_t op,
12192 uint64_t value)
12193 {
12194 #if XNU_MONITOR
12195 return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
12196 #else /* XNU_MONITOR */
12197 return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
12198 #endif /* XNU_MONITOR */
12199 }
12200
12201 MARK_AS_PMAP_TEXT uint64_t
12202 pmap_ro_zone_atomic_op_internal(
12203 zone_id_t zid,
12204 vm_offset_t va,
12205 vm_offset_t offset,
12206 zro_atomic_op_t op,
12207 uint64_t value)
12208 {
12209 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12210 vm_size_t value_size = op & 0xf;
12211
12212 pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
12213 pmap_ro_zone_lock_phy_page(pa, va, value_size);
12214 value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
12215 pmap_ro_zone_unlock_phy_page(pa, va, value_size);
12216
12217 return value;
12218 }
12219
12220 /**
12221 * bzero for allocations from read only zones, that writes through the
12222 * physical aperture.
12223 *
12224 * @note This is called by the zfree path of all allocations from read
12225 * only zones.
12226 *
12227 * @param zid The ID of the zone the allocation belongs to.
12228 * @param va VA of element to be zeroed.
12229 * @param offset Offset in the element.
12230 * @param size Size of allocation.
12231 *
12232 */
12233
12234 void
12235 pmap_ro_zone_bzero(
12236 zone_id_t zid,
12237 vm_offset_t va,
12238 vm_offset_t offset,
12239 vm_size_t size)
12240 {
12241 #if XNU_MONITOR
12242 pmap_ro_zone_bzero_ppl(zid, va, offset, size);
12243 #else /* XNU_MONITOR */
12244 pmap_ro_zone_bzero_internal(zid, va, offset, size);
12245 #endif /* XNU_MONITOR */
12246 }
12247
12248 MARK_AS_PMAP_TEXT void
12249 pmap_ro_zone_bzero_internal(
12250 zone_id_t zid,
12251 vm_offset_t va,
12252 vm_offset_t offset,
12253 vm_size_t size)
12254 {
12255 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12256 pmap_ro_zone_validate_element(zid, va, offset, 0, size);
12257 pmap_ro_zone_lock_phy_page(pa, va, size);
12258 bzero((void*)phystokv(pa), size);
12259 pmap_ro_zone_unlock_phy_page(pa, va, size);
12260 }
12261
12262 /**
12263 * Removes write access from the Physical Aperture.
12264 *
12265 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12266 * @note Designed to work only with the zone allocator's read-only submap.
12267 *
12268 * @param va VA of the page to restore write access to.
12269 *
12270 */
12271 MARK_AS_PMAP_TEXT static void
12272 pmap_phys_write_disable(vm_address_t va)
12273 {
12274 #if XNU_MONITOR
12275 pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
12276 #else /* XNU_MONITOR */
12277 pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
12278 #endif /* XNU_MONITOR */
12279 }
12280
12281 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12282
12283 MARK_AS_PMAP_TEXT mach_vm_size_t
12284 pmap_query_resident_internal(
12285 pmap_t pmap,
12286 vm_map_address_t start,
12287 vm_map_address_t end,
12288 mach_vm_size_t *compressed_bytes_p)
12289 {
12290 mach_vm_size_t resident_bytes = 0;
12291 mach_vm_size_t compressed_bytes = 0;
12292
12293 pt_entry_t *bpte, *epte;
12294 pt_entry_t *pte_p;
12295 tt_entry_t *tte_p;
12296
12297 if (pmap == NULL) {
12298 return PMAP_RESIDENT_INVALID;
12299 }
12300
12301 validate_pmap(pmap);
12302
12303 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12304
12305 /* Ensure that this request is valid, and addresses exactly one TTE. */
12306 if (__improbable((start % pt_attr_page_size(pt_attr)) ||
12307 (end % pt_attr_page_size(pt_attr)))) {
12308 panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
12309 }
12310
12311 if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
12312 panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
12313 }
12314
12315 pmap_lock(pmap, PMAP_LOCK_SHARED);
12316 tte_p = pmap_tte(pmap, start);
12317 if (tte_p == (tt_entry_t *) NULL) {
12318 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12319 return PMAP_RESIDENT_INVALID;
12320 }
12321 if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
12322 pte_p = (pt_entry_t *) ttetokv(*tte_p);
12323 bpte = &pte_p[pte_index(pt_attr, start)];
12324 epte = &pte_p[pte_index(pt_attr, end)];
12325
12326 for (; bpte < epte; bpte++) {
12327 if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
12328 compressed_bytes += pt_attr_page_size(pt_attr);
12329 } else if (pa_valid(pte_to_pa(*bpte))) {
12330 resident_bytes += pt_attr_page_size(pt_attr);
12331 }
12332 }
12333 }
12334 pmap_unlock(pmap, PMAP_LOCK_SHARED);
12335
12336 if (compressed_bytes_p) {
12337 pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
12338 *compressed_bytes_p += compressed_bytes;
12339 pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
12340 }
12341
12342 return resident_bytes;
12343 }
12344
12345 mach_vm_size_t
12346 pmap_query_resident(
12347 pmap_t pmap,
12348 vm_map_address_t start,
12349 vm_map_address_t end,
12350 mach_vm_size_t *compressed_bytes_p)
12351 {
12352 mach_vm_size_t total_resident_bytes;
12353 mach_vm_size_t compressed_bytes;
12354 vm_map_address_t va;
12355
12356
12357 if (pmap == PMAP_NULL) {
12358 if (compressed_bytes_p) {
12359 *compressed_bytes_p = 0;
12360 }
12361 return 0;
12362 }
12363
12364 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12365
12366 total_resident_bytes = 0;
12367 compressed_bytes = 0;
12368
12369 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
12370 VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
12371 VM_KERNEL_ADDRHIDE(end));
12372
12373 va = start;
12374 while (va < end) {
12375 vm_map_address_t l;
12376 mach_vm_size_t resident_bytes;
12377
12378 l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
12379
12380 if (l > end) {
12381 l = end;
12382 }
12383 #if XNU_MONITOR
12384 resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
12385 #else
12386 resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
12387 #endif
12388 if (resident_bytes == PMAP_RESIDENT_INVALID) {
12389 break;
12390 }
12391
12392 total_resident_bytes += resident_bytes;
12393
12394 va = l;
12395 }
12396
12397 if (compressed_bytes_p) {
12398 *compressed_bytes_p = compressed_bytes;
12399 }
12400
12401 PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
12402 total_resident_bytes);
12403
12404 return total_resident_bytes;
12405 }
12406
#if MACH_ASSERT
/**
 * Asks the VM layer to check this pmap's ledgers, provided the pmap is still
 * associated with a specific process.
 *
 * A pmap_pid of 0 or -1 means the pmap was never, or is no longer, fully
 * associated with a task (e.g. the old pmap after a fork()/exec() or
 * spawn()). In that case its "ledger" still points at a task that is now
 * using a different (and active) address space, so the ledgers cannot be
 * expected to balance and the check is skipped.
 *
 * When the pid is set, the pmap went through pmap_set_process() in
 * task_terminate_internal(), so the task's ledger should not have been
 * re-used and all the pmap ledgers should be back to 0.
 *
 * @param pmap The pmap whose ledgers should be checked.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	if (pmap->pmap_pid != 0 && pmap->pmap_pid != -1) {
		vm_map_pmap_check_ledgers(pmap, pmap->ledger,
		    pmap->pmap_pid, pmap->pmap_procname);
	}
}
#endif /* MACH_ASSERT */
12437
/* Intentionally a no-op here: both arguments are ignored. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12442
12443 /**
12444 * The minimum shared region nesting size is used by the VM to determine when to
12445 * break up large mappings to nested regions. The smallest size that these
12446 * mappings can be broken into is determined by what page table level those
12447 * regions are being nested in at and the size of the page tables.
12448 *
12449 * For instance, if a nested region is nesting at L2 for a process utilizing
12450 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12451 * block entry).
12452 *
12453 * @param pmap The target pmap to determine the block size based on whether it's
12454 * using 16KB or 4KB page tables.
12455 */
12456 uint64_t
12457 pmap_shared_region_size_min(__unused pmap_t pmap)
12458 {
12459 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12460
12461 /**
12462 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12463 * 4KB pages). This means that a target pmap will contain L2 entries that
12464 * point to shared L3 page tables in the shared region pmap.
12465 */
12466 return pt_attr_twig_size(pt_attr);
12467 }
12468
12469 boolean_t
12470 pmap_enforces_execute_only(
12471 pmap_t pmap)
12472 {
12473 return pmap != kernel_pmap;
12474 }
12475
/**
 * Records the per-pmap code-signing enforcement flag.
 *
 * @param pmap The pmap to update; checked with validate_pmap_mutable() first.
 * @param new_value The value stored into pmap->pmap_vm_map_cs_enforced.
 */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12484
/*
 * Sets the per-pmap code-signing enforcement flag. Forwards to the _ppl
 * variant when XNU_MONITOR is configured, and to the _internal
 * implementation otherwise.
 */
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12496
12497 extern int cs_process_enforcement_enable;
12498 bool
12499 pmap_get_vm_map_cs_enforced(
12500 pmap_t pmap)
12501 {
12502 if (cs_process_enforcement_enable) {
12503 return true;
12504 }
12505 return pmap->pmap_vm_map_cs_enforced;
12506 }
12507
12508 MARK_AS_PMAP_TEXT void
12509 pmap_set_jit_entitled_internal(
12510 __unused pmap_t pmap)
12511 {
12512 return;
12513 }
12514
/*
 * Forwards to the _ppl variant when XNU_MONITOR is configured, and to the
 * _internal implementation otherwise.
 */
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_jit_entitled_ppl(pmap);
#else
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12525
/* Always reports false in this configuration; the pmap argument is ignored. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12532
12533 MARK_AS_PMAP_TEXT void
12534 pmap_set_tpro_internal(
12535 __unused pmap_t pmap)
12536 {
12537 return;
12538 }
12539
/*
 * Forwards to the _ppl variant when XNU_MONITOR is configured, and to the
 * _internal implementation otherwise.
 */
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12550
/* Always reports false in this configuration; the pmap argument is ignored. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12557
/* Number of times pmap_query_page_info_internal() had to restart its lookup. */
uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;

/**
 * Computes the PMAP_QUERY_PAGE_* disposition flags for the page mapped at
 * `va` in `pmap` and stores them through `disp_p`.
 *
 * @param pmap The pmap to query. PMAP_NULL and kernel_pmap are rejected.
 * @param va The virtual address to look up.
 * @param disp_p Output location for the disposition; it is pinned around
 *               every write performed here.
 *
 * @return KERN_INVALID_ARGUMENT (with *disp_p set to 0) for PMAP_NULL or
 *         kernel_pmap; KERN_SUCCESS otherwise, including when no PTE exists
 *         for `va` (in which case *disp_p is 0).
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
	pmap_paddr_t    pa;
	int             disp;
	unsigned int    pai;
	pt_entry_t      *pte_p, pte;
	pv_entry_t      **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	/* Every attempt starts from a clean disposition. */
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Snapshot the PTE once; it is re-checked below under the PV head lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		/* No physical address: the entry may still be a compressed marker. */
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			/* Reported as present only; pa_valid() rejected the address. */
			goto done;
		}
		pvh_lock(pai);
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			/* Walk the PVE list until an entry that tracks this PTE is found. */
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		/* At most one of these three flags is reported, in this priority order. */
		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12638
/*
 * Queries the disposition of the page mapped at `va` in `pmap`. Forwards to
 * the _ppl variant when XNU_MONITOR is configured, and to the _internal
 * implementation otherwise, returning its result unchanged.
 */
kern_return_t
pmap_query_page_info(
	pmap_t          pmap,
	vm_map_offset_t va,
	int             *disp_p)
{
#if XNU_MONITOR
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12651
12652
12653
/*
 * Returns the number of virtual address bits for user mappings. With
 * __ARM_MIXED_PAGE_SIZE__ this is 64 minus the T0SZ field extracted from the
 * pmap's TCR value; otherwise it is 64 minus the T0SZ_BOOT constant and the
 * pmap argument is unused.
 */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	return 64 - T0SZ_BOOT;
#endif
}
12664
/* Returns the number of kernel virtual address bits: 64 minus T1SZ_BOOT. */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12670
12671 static vm_map_size_t
12672 pmap_user_va_size(pmap_t pmap)
12673 {
12674 return 1ULL << pmap_user_va_bits(pmap);
12675 }
12676
12677
12678
/* Stub for configurations without support: always reports false. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12685
/*
 * Stub for configurations without support: ignores all arguments and panics
 * unconditionally, so it never returns.
 */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12692
/* Stub for configurations without support: always returns NULL. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12699
/* Stub for configurations without support: the argument is ignored. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12705
12706
12707 #if PMAP_CS_PPL_MONITOR
12708
/* Immutable part of the trust cache runtime; passed to the AMFI TrustCache calls in this file */
SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;

/* Mutable part of the trust cache runtime */
MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;

/*
 * Lock for the trust cache runtime: taken shared around queries and
 * exclusive around loads.
 */
MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12717
12718 MARK_AS_PMAP_TEXT kern_return_t
12719 pmap_check_trust_cache_runtime_for_uuid_internal(
12720 const uint8_t check_uuid[kUUIDSize])
12721 {
12722 kern_return_t ret = KERN_DENIED;
12723
12724 if (amfi->TrustCache.version < 3) {
12725 /* AMFI change hasn't landed in the build */
12726 pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
12727 return KERN_NOT_SUPPORTED;
12728 }
12729
12730 /* Lock the runtime as shared */
12731 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12732
12733 TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
12734 &ppl_trust_cache_rt,
12735 check_uuid,
12736 NULL);
12737
12738 /* Unlock the runtime */
12739 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12740
12741 if (tc_ret.error == kTCReturnSuccess) {
12742 ret = KERN_SUCCESS;
12743 } else if (tc_ret.error == kTCReturnNotFound) {
12744 ret = KERN_NOT_FOUND;
12745 } else {
12746 ret = KERN_FAILURE;
12747 pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
12748 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12749 }
12750
12751 return ret;
12752 }
12753
/*
 * Checks the trust cache runtime for the given UUID by forwarding to the
 * _ppl variant and returning its result unchanged.
 */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12760
/**
 * Loads a trust cache of the given type into the trust cache runtime.
 *
 * The payload and manifest are locked down for the duration of the load. On
 * success the payload stays locked down (it backs the loaded trust cache);
 * on failure it is unlocked again. The manifest is always unlocked before
 * returning.
 *
 * @param type The type of trust cache being loaded.
 * @param pmap_img4_payload Address of a pmap_img4_payload_t, which holds the
 *                          trust cache data structure followed by the img4
 *                          payload.
 * @param pmap_img4_payload_len Length of the whole pmap_img4_payload_t blob.
 * @param img4_manifest Address of the img4 manifest.
 * @param img4_manifest_len Length of the img4 manifest.
 * @param img4_aux_manifest Currently ignored.
 * @param img4_aux_manifest_len Currently ignored.
 *
 * @return KERN_SUCCESS on success; KERN_ALREADY_IN_SET if the runtime reports
 *         a duplicate; KERN_FAILURE for any other load error; or the error
 *         from pmap_reserve_ppl_page() (e.g. KERN_RESOURCE_SHORTAGE) if no
 *         spare page could be reserved. Several validation failures panic.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that cannot be loaded through this interface */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* A resource shortage is not logged; any other failure is */
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			/* Duplicates are reported to the caller without being logged */
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12880
12881 kern_return_t
12882 pmap_load_trust_cache_with_type(
12883 TCType_t type,
12884 const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
12885 const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
12886 const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
12887 {
12888 kern_return_t ret = KERN_DENIED;
12889
12890 ret = pmap_load_trust_cache_with_type_ppl(
12891 type,
12892 pmap_img4_payload, pmap_img4_payload_len,
12893 img4_manifest, img4_manifest_len,
12894 img4_aux_manifest, img4_aux_manifest_len);
12895
12896 while (ret == KERN_RESOURCE_SHORTAGE) {
12897 /* Allocate a page from the free list */
12898 pmap_alloc_page_for_ppl(0);
12899
12900 /* Attempt the call again */
12901 ret = pmap_load_trust_cache_with_type_ppl(
12902 type,
12903 pmap_img4_payload, pmap_img4_payload_len,
12904 img4_manifest, img4_manifest_len,
12905 img4_aux_manifest, img4_aux_manifest_len);
12906 }
12907
12908 return ret;
12909 }
12910
12911 MARK_AS_PMAP_TEXT kern_return_t
12912 pmap_query_trust_cache_safe(
12913 TCQueryType_t query_type,
12914 const uint8_t cdhash[kTCEntryHashSize],
12915 TrustCacheQueryToken_t *query_token)
12916 {
12917 kern_return_t ret = KERN_NOT_FOUND;
12918
12919 /* Validate the query type preemptively */
12920 if (query_type >= kTCQueryTypeTotal) {
12921 pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
12922 return KERN_INVALID_ARGUMENT;
12923 }
12924
12925 /* Lock the runtime as shared */
12926 lck_rw_lock_shared(&ppl_trust_cache_rt_lock);
12927
12928 TCReturn_t tc_ret = amfi->TrustCache.query(
12929 &ppl_trust_cache_rt,
12930 query_type,
12931 cdhash,
12932 query_token);
12933
12934 /* Unlock the runtime */
12935 lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);
12936
12937 if (tc_ret.error == kTCReturnSuccess) {
12938 ret = KERN_SUCCESS;
12939 } else if (tc_ret.error == kTCReturnNotFound) {
12940 ret = KERN_NOT_FOUND;
12941 } else {
12942 ret = KERN_FAILURE;
12943 pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
12944 tc_ret.component, tc_ret.error, tc_ret.uniqueError);
12945 }
12946
12947 return ret;
12948 }
12949
12950 MARK_AS_PMAP_TEXT kern_return_t
12951 pmap_query_trust_cache_internal(
12952 TCQueryType_t query_type,
12953 const uint8_t cdhash[kTCEntryHashSize],
12954 TrustCacheQueryToken_t *query_token)
12955 {
12956 kern_return_t ret = KERN_NOT_FOUND;
12957 TrustCacheQueryToken_t query_token_safe = {0};
12958 uint8_t cdhash_safe[kTCEntryHashSize] = {0};
12959
12960 /* Copy in the CDHash into PPL storage */
12961 memcpy(cdhash_safe, cdhash, kTCEntryHashSize);
12962
12963 /* Query through the safe API since we're in the PPL now */
12964 ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);
12965
12966 if (query_token != NULL) {
12967 pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12968 memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
12969 pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
12970 }
12971
12972 return ret;
12973 }
12974
12975 kern_return_t
12976 pmap_query_trust_cache(
12977 TCQueryType_t query_type,
12978 const uint8_t cdhash[kTCEntryHashSize],
12979 TrustCacheQueryToken_t *query_token)
12980 {
12981 kern_return_t ret = KERN_NOT_FOUND;
12982
12983 ret = pmap_query_trust_cache_ppl(
12984 query_type,
12985 cdhash,
12986 query_token);
12987
12988 return ret;
12989 }
12990
12991 MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
12992 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
12993
12994 MARK_AS_PMAP_TEXT void
12995 pmap_toggle_developer_mode_internal(
12996 bool state)
12997 {
12998 bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);
12999
13000 /*
13001 * Only the following state transitions are allowed:
13002 * -- not set --> false
13003 * -- not set --> true
13004 * -- true --> false
13005 * -- true --> true
13006 * -- false --> false
13007 *
13008 * We never allow false --> true transitions.
13009 */
13010 bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);
13011
13012 if ((current == false) && (state == true) && state_set) {
13013 panic("PMAP_CS: attempted to enable developer mode incorrectly");
13014 }
13015
13016 /* We're going to update the developer mode state, so update this first */
13017 os_atomic_store(&ppl_developer_mode_set, true, relaxed);
13018
13019 /* Update the developer mode state on the system */
13020 os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
13021 }
13022
/* Updates the developer mode state by forwarding to the _ppl variant. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13029
13030 #endif /* PMAP_CS_PPL_MONITOR */
13031
13032 #if PMAP_CS_INCLUDE_CODE_SIGNING
13033
/**
 * Ordering function for the provisioning profile red-black tree: profiles
 * are ordered by the numeric value of their address.
 *
 * The addresses are compared as integers. Applying relational operators
 * directly to pointers that do not point into the same object is undefined
 * behavior in C, and the tree compares unrelated profile objects.
 *
 * @param profile0 First profile address.
 * @param profile1 Second profile address.
 *
 * @return -1 if profile0 sorts before profile1, 1 if it sorts after, and 0
 *         if both refer to the same address.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	const uintptr_t addr0 = (uintptr_t)profile0;
	const uintptr_t addr1 = (uintptr_t)profile1;

	if (addr0 < addr1) {
		return -1;
	} else if (addr0 > addr1) {
		return 1;
	}
	return 0;
}
13046
/*
 * Red-black tree for managing provisioning profiles. Nodes are
 * struct _pmap_cs_profile objects linked through their `link` member and
 * ordered by pmap_cs_profiles_rbtree_compare().
 */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13056
/*
 * Initializes the provisioning profile bookkeeping: the lock protecting the
 * profile red-black tree, and the (initially empty) tree itself.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	/* Mark the lock as one that must not sleep */
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13069
13070 static bool
13071 pmap_is_testflight_profile(
13072 pmap_cs_profile_t *profile_obj)
13073 {
13074 const char *entitlement_name = "beta-reports-active";
13075 const size_t entitlement_length = strlen(entitlement_name);
13076 CEQueryOperation_t query[2] = {0};
13077
13078 /* If the profile provisions no entitlements, then it isn't a test flight one */
13079 if (profile_obj->entitlements_ctx == NULL) {
13080 return false;
13081 }
13082
13083 /* Build our CoreEntitlements query */
13084 query[0].opcode = kCEOpSelectKey;
13085 memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
13086 query[0].parameters.stringParameter.length = entitlement_length;
13087 query[1] = CEMatchBool(true);
13088
13089 CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
13090 profile_obj->entitlements_ctx,
13091 query, 2);
13092
13093 if (ce_err == amfi->CoreEntitlements.kNoError) {
13094 return true;
13095 }
13096
13097 return false;
13098 }
13099
13100 static bool
13101 pmap_is_development_profile(
13102 pmap_cs_profile_t *profile_obj)
13103 {
13104 /* Check for UPP */
13105 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13106 *profile_obj->profile_ctx,
13107 CESelectDictValue("ProvisionsAllDevices"));
13108 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13109 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13110 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13111 return false;
13112 }
13113 }
13114
13115 /* Check for TestFlight profile */
13116 if (pmap_is_testflight_profile(profile_obj) == true) {
13117 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13118 return false;
13119 }
13120
13121 pmap_cs_log_info("%p: development profile", profile_obj);
13122 return true;
13123 }
13124
13125 static kern_return_t
13126 pmap_initialize_profile_entitlements(
13127 pmap_cs_profile_t *profile_obj)
13128 {
13129 const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
13130 *profile_obj->profile_ctx,
13131 CESelectDictValue("Entitlements"));
13132
13133 if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
13134 memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
13135 profile_obj->entitlements_ctx = NULL;
13136
13137 pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
13138 return KERN_NOT_FOUND;
13139 }
13140
13141 const uint8_t *der_start = entitlements_der_ctx.state.der_start;
13142 const uint8_t *der_end = entitlements_der_ctx.state.der_end;
13143
13144 CEValidationResult ce_result = {0};
13145 CEError_t ce_err = amfi->CoreEntitlements.Validate(
13146 pmap_cs_core_entitlements_runtime,
13147 &ce_result,
13148 der_start, der_end);
13149 if (ce_err != amfi->CoreEntitlements.kNoError) {
13150 pmap_cs_log_error("unable to validate profile entitlements: %s",
13151 amfi->CoreEntitlements.GetErrorString(ce_err));
13152
13153 return KERN_ABORTED;
13154 }
13155
13156 struct CEQueryContext query_ctx = {0};
13157 ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
13158 pmap_cs_core_entitlements_runtime,
13159 ce_result,
13160 &query_ctx);
13161 if (ce_err != amfi->CoreEntitlements.kNoError) {
13162 pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
13163 amfi->CoreEntitlements.GetErrorString(ce_err));
13164
13165 return KERN_ABORTED;
13166 }
13167
13168 /* Setup the entitlements context within the profile object */
13169 profile_obj->entitlements_ctx_storage = query_ctx;
13170 profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;
13171
13172 pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
13173 return KERN_SUCCESS;
13174 }
13175
/**
 * Validates a provisioning profile payload and registers it in the profile
 * red-black tree.
 *
 * The payload is locked down and stays locked down after a successful
 * registration. Almost every validation failure is fatal (panic); the only
 * non-success return is a failure to reserve the spare page.
 *
 * @param payload_addr Address of a pmap_profile_payload_t, which contains
 *                     the profile object storage and the profile blob.
 * @param payload_size Size of the whole payload.
 *
 * @return KERN_SUCCESS once the profile is registered, or the error from
 *         pmap_reserve_ppl_page() (e.g. KERN_RESOURCE_SHORTAGE).
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		/* A resource shortage is not logged; any other failure is */
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

	/* The development root certificate is only accepted on internal builds */
#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the blob; on success this yields a pointer/length to the content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Create a DER context spanning the validated profile content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/* Acquire a writable version of the profile data structure */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	/* KERN_NOT_FOUND (no entitlements provisioned) is tolerated */
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13279
13280 kern_return_t
13281 pmap_register_provisioning_profile(
13282 const vm_address_t payload_addr,
13283 const vm_size_t payload_size)
13284 {
13285 kern_return_t ret = KERN_DENIED;
13286
13287 ret = pmap_register_provisioning_profile_ppl(
13288 payload_addr,
13289 payload_size);
13290
13291 while (ret == KERN_RESOURCE_SHORTAGE) {
13292 /* Allocate a page from the free list */
13293 pmap_alloc_page_for_ppl(0);
13294
13295 /* Attempt the call again */
13296 ret = pmap_register_provisioning_profile_ppl(
13297 payload_addr,
13298 payload_size);
13299 }
13300
13301 return ret;
13302 }
13303
13304 kern_return_t
13305 pmap_unregister_provisioning_profile_internal(
13306 pmap_cs_profile_t *profile_obj)
13307 {
13308 kern_return_t ret = KERN_DENIED;
13309
13310 /* Lock the red-black tree exclusively */
13311 lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
13312
13313 if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
13314 panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
13315 }
13316
13317 uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
13318 if (reference_count != 0) {
13319 ret = KERN_FAILURE;
13320 goto exit;
13321 }
13322
13323 /* Remove the profile from the red-black tree */
13324 RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);
13325
13326 /* Unregistration was a success */
13327 ret = KERN_SUCCESS;
13328
13329 exit:
13330 /* Unlock the red-black tree */
13331 lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);
13332
13333 if (ret == KERN_SUCCESS) {
13334 /* Get the original payload address */
13335 const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
13336 const vm_address_t payload_addr = (const vm_address_t)profile_payload;
13337
13338 /* Get the original payload size */
13339 vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
13340 payload_size = round_page(payload_size);
13341
13342 /* Unlock the profile payload */
13343 pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
13344 pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
13345 profile_payload, payload_size);
13346
13347 profile_obj = NULL;
13348 }
13349 return ret;
13350 }
13351
13352 kern_return_t
13353 pmap_unregister_provisioning_profile(
13354 pmap_cs_profile_t *profile_obj)
13355 {
13356 return pmap_unregister_provisioning_profile_ppl(profile_obj);
13357 }
13358
13359 kern_return_t
13360 pmap_associate_provisioning_profile_internal(
13361 pmap_cs_code_directory_t *cd_entry,
13362 pmap_cs_profile_t *profile_obj)
13363 {
13364 kern_return_t ret = KERN_DENIED;
13365
13366 /* Acquire the lock on the code directory */
13367 pmap_cs_lock_code_directory(cd_entry);
13368
13369 if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
13370 pmap_cs_log_error("disallowing profile association with verified signature");
13371 goto exit;
13372 } else if (cd_entry->profile_obj != NULL) {
13373 pmap_cs_log_error("disallowing multiple profile associations with signature");
13374 goto exit;
13375 }
13376
13377 /* Lock the red-black tree as shared */
13378 lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);
13379
13380 if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
13381 panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
13382 } else if (profile_obj->profile_validated == false) {
13383 panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
13384 }
13385
13386 /* Associate the profile with the signature */
13387 cd_entry->profile_obj = profile_obj;
13388
13389 /* Increment the reference count on the profile object */
13390 uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
13391 if (reference_count == 0) {
13392 panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
13393 }
13394
13395 /* Unlock the red-black tree */
13396 lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);
13397
13398 /* Association was a success */
13399 pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
13400 ret = KERN_SUCCESS;
13401
13402 exit:
13403 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13404
13405 return ret;
13406 }
13407
13408 kern_return_t
13409 pmap_associate_provisioning_profile(
13410 pmap_cs_code_directory_t *cd_entry,
13411 pmap_cs_profile_t *profile_obj)
13412 {
13413 return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13414 }
13415
13416 kern_return_t
13417 pmap_disassociate_provisioning_profile_internal(
13418 pmap_cs_code_directory_t *cd_entry)
13419 {
13420 pmap_cs_profile_t *profile_obj = NULL;
13421 kern_return_t ret = KERN_DENIED;
13422
13423 /* Acquire the lock on the code directory */
13424 pmap_cs_lock_code_directory(cd_entry);
13425
13426 if (cd_entry->profile_obj == NULL) {
13427 ret = KERN_NOT_FOUND;
13428 goto exit;
13429 }
13430 profile_obj = cd_entry->profile_obj;
13431
13432 /* Disassociate the profile from the signature */
13433 cd_entry->profile_obj = NULL;
13434
13435 /* Disassociation was a success */
13436 ret = KERN_SUCCESS;
13437
13438 exit:
13439 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13440
13441 if (ret == KERN_SUCCESS) {
13442 /* Decrement the reference count on the profile object */
13443 uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
13444 if (reference_count == UINT32_MAX) {
13445 panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
13446 }
13447 pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
13448 }
13449 return ret;
13450 }
13451
13452 kern_return_t
13453 pmap_disassociate_provisioning_profile(
13454 pmap_cs_code_directory_t *cd_entry)
13455 {
13456 return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13457 }
13458
13459 kern_return_t
13460 pmap_associate_kernel_entitlements_internal(
13461 pmap_cs_code_directory_t *cd_entry,
13462 const void *kernel_entitlements)
13463 {
13464 kern_return_t ret = KERN_DENIED;
13465
13466 if (kernel_entitlements == NULL) {
13467 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13468 }
13469
13470 /* Acquire the lock on the code directory */
13471 pmap_cs_lock_code_directory(cd_entry);
13472
13473 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13474 ret = KERN_DENIED;
13475 goto out;
13476 } else if (cd_entry->kernel_entitlements != NULL) {
13477 ret = KERN_DENIED;
13478 goto out;
13479 }
13480 cd_entry->kernel_entitlements = kernel_entitlements;
13481
13482 /* Association was a success */
13483 ret = KERN_SUCCESS;
13484
13485 out:
13486 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13487 return ret;
13488 }
13489
13490 kern_return_t
13491 pmap_associate_kernel_entitlements(
13492 pmap_cs_code_directory_t *cd_entry,
13493 const void *kernel_entitlements)
13494 {
13495 return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13496 }
13497
13498 kern_return_t
13499 pmap_resolve_kernel_entitlements_internal(
13500 pmap_t pmap,
13501 const void **kernel_entitlements)
13502 {
13503 const void *entitlements = NULL;
13504 pmap_cs_code_directory_t *cd_entry = NULL;
13505 kern_return_t ret = KERN_DENIED;
13506
13507 /* Validate the PMAP object */
13508 validate_pmap(pmap);
13509
13510 /* Ensure no kernel PMAP */
13511 if (pmap == kernel_pmap) {
13512 return KERN_NOT_FOUND;
13513 }
13514
13515 /* Attempt a shared lock on the PMAP */
13516 if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
13517 return KERN_ABORTED;
13518 }
13519
13520 /*
13521 * Acquire the code signature from the PMAP. This function is called when
13522 * performing an entitlement check, and since we've confirmed this isn't
13523 * the kernel_pmap, at this stage, each pmap _should_ have a main region
13524 * with a code signature.
13525 */
13526 cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
13527 if (cd_entry == NULL) {
13528 ret = KERN_NOT_FOUND;
13529 goto out;
13530 }
13531
13532 entitlements = cd_entry->kernel_entitlements;
13533 if (entitlements == NULL) {
13534 ret = KERN_NOT_FOUND;
13535 goto out;
13536 }
13537
13538 /* Pin and write out the entitlements object pointer */
13539 if (kernel_entitlements != NULL) {
13540 pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
13541 *kernel_entitlements = entitlements;
13542 pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
13543 }
13544
13545 /* Successfully resolved the entitlements */
13546 ret = KERN_SUCCESS;
13547
13548 out:
13549 /* Unlock the code signature object */
13550 if (cd_entry != NULL) {
13551 lck_rw_unlock_shared(&cd_entry->rwlock);
13552 cd_entry = NULL;
13553 }
13554
13555 /* Unlock the PMAP object */
13556 pmap_unlock(pmap, PMAP_LOCK_SHARED);
13557
13558 return ret;
13559 }
13560
13561 kern_return_t
13562 pmap_resolve_kernel_entitlements(
13563 pmap_t pmap,
13564 const void **kernel_entitlements)
13565 {
13566 kern_return_t ret = KERN_DENIED;
13567
13568 do {
13569 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13570 } while (ret == KERN_ABORTED);
13571
13572 return ret;
13573 }
13574
13575 kern_return_t
13576 pmap_accelerate_entitlements_internal(
13577 pmap_cs_code_directory_t *cd_entry)
13578 {
13579 const coreentitlements_t *CoreEntitlements = NULL;
13580 const CS_SuperBlob *superblob = NULL;
13581 pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
13582 size_t signature_length = 0;
13583 size_t acceleration_length = 0;
13584 size_t required_length = 0;
13585 kern_return_t ret = KERN_DENIED;
13586
13587 /* Setup the CoreEntitlements interface */
13588 CoreEntitlements = &amfi->CoreEntitlements;
13589
13590 CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;
13591
13592 /* Acquire the lock on the code directory */
13593 pmap_cs_lock_code_directory(cd_entry);
13594
13595 /*
13596 * Only reconstituted code signatures can be accelerated. This is only a policy
13597 * decision we make since this allows us to re-use any unused space within the
13598 * locked down code signature region. There is also a decent bit of validation
13599 * within the reconstitution function to ensure blobs are ordered and do not
13600 * contain any padding around them which can cause issues here.
13601 *
13602 * This also serves as a check to ensure the signature is trusted.
13603 */
13604 if (cd_entry->unneeded_code_signature_unlocked == false) {
13605 ret = KERN_DENIED;
13606 goto out;
13607 }
13608
13609 if (cd_entry->ce_ctx == NULL) {
13610 ret = KERN_SUCCESS;
13611 goto out;
13612 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
13613 ret = KERN_SUCCESS;
13614 goto out;
13615 }
13616
13617 /* We only support accelerating when size <= PAGE_SIZE */
13618 ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
13619 if (ce_err != CoreEntitlements->kNoError) {
13620 if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
13621 /* Small entitlement blobs aren't eligible */
13622 ret = KERN_SUCCESS;
13623 goto out;
13624 }
13625 panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
13626 cd_entry, CoreEntitlements->GetErrorString(ce_err));
13627 } else if (acceleration_length > PAGE_SIZE) {
13628 ret = KERN_ABORTED;
13629 goto out;
13630 }
13631 assert(acceleration_length > 0);
13632
13633 superblob = cd_entry->superblob;
13634 signature_length = ntohl(superblob->length);
13635
13636 /* Adjust the required length for the overhead structure -- can't overflow */
13637 required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
13638 if (required_length > PAGE_SIZE) {
13639 ret = KERN_ABORTED;
13640 goto out;
13641 }
13642
13643 /*
13644 * First we'll check if the code signature has enough space within the locked down
13645 * region of memory to hold the buffer. If not, then we'll see if we can bucket
13646 * allocate the buffer, and if not, we'll just allocate an entire page from the
13647 * free list.
13648 *
13649 * When we're storing the buffer within the code signature, we also need to make
13650 * sure we account for alignment of the buffer.
13651 */
13652 const vm_address_t align_mask = sizeof(void*) - 1;
13653 size_t required_length_within_sig = required_length + align_mask;
13654
13655 if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
13656 vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
13657 aligned_buf = (aligned_buf + align_mask) & ~align_mask;
13658
13659 /* We need to resolve to the physical aperture */
13660 pmap_paddr_t phys_addr = kvtophys(aligned_buf);
13661 acceleration_buf = (void*)phystokv(phys_addr);
13662
13663 /* Ensure the offset within the page wasn't lost */
13664 assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));
13665
13666 acceleration_buf->allocated = false;
13667 pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
13668 } else {
13669 if (required_length <= pmap_cs_blob_limit) {
13670 struct pmap_cs_blob *bucket = NULL;
13671 size_t bucket_size = 0;
13672
13673 /* Allocate a buffer from the blob allocator */
13674 ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
13675 if (ret != KERN_SUCCESS) {
13676 goto out;
13677 }
13678 acceleration_buf = (void*)bucket->blob;
13679 pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
13680 } else {
13681 pmap_paddr_t phys_addr = 0;
13682 ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
13683 if (ret != KERN_SUCCESS) {
13684 goto out;
13685 }
13686 acceleration_buf = (void*)phystokv(phys_addr);
13687 pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
13688 }
13689 acceleration_buf->allocated = true;
13690 }
13691 acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
13692 acceleration_buf->length = acceleration_length;
13693
13694 /* Take the acceleration buffer lock */
13695 pmap_simple_lock(&pmap_cs_acceleration_buf_lock);
13696
13697 /* Setup the global acceleration buffer state */
13698 pmap_cs_acceleration_buf = acceleration_buf;
13699
13700 /* Accelerate the entitlements */
13701 ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
13702 if (ce_err != CoreEntitlements->kNoError) {
13703 panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
13704 cd_entry, CoreEntitlements->GetErrorString(ce_err));
13705 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
13706 panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
13707 }
13708
13709 /*
13710 * The global acceleration buffer lock is unlocked by the allocation function itself
13711 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
13712 * an assert that the lock is unlocked here since another thread could have acquired
13713 * it by now.
13714 */
13715 ret = KERN_SUCCESS;
13716
13717 out:
13718 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13719 return ret;
13720 }
13721
13722 kern_return_t
13723 pmap_accelerate_entitlements(
13724 pmap_cs_code_directory_t *cd_entry)
13725 {
13726 kern_return_t ret = KERN_DENIED;
13727
13728 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13729 while (ret == KERN_RESOURCE_SHORTAGE) {
13730 /* Allocate a page for the PPL */
13731 pmap_alloc_page_for_ppl(0);
13732
13733 /* Try again */
13734 ret = pmap_accelerate_entitlements_ppl(cd_entry);
13735 }
13736
13737 return ret;
13738 }
13739
13740 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13741
13742 MARK_AS_PMAP_TEXT bool
13743 pmap_lookup_in_loaded_trust_caches_internal(
13744 const uint8_t cdhash[CS_CDHASH_LEN])
13745 {
13746 kern_return_t kr = KERN_NOT_FOUND;
13747
13748 #if PMAP_CS_PPL_MONITOR
13749 /*
13750 * If we have the PPL monitor, then this function can only be called from
13751 * within the PPL. Calling it directly would've caused a panic, so we can
13752 * assume that we're in the PPL here.
13753 */
13754 uint8_t cdhash_safe[CS_CDHASH_LEN];
13755 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13756
13757 kr = pmap_query_trust_cache_safe(
13758 kTCQueryTypeLoadable,
13759 cdhash_safe,
13760 NULL);
13761 #else
13762 kr = query_trust_cache(
13763 kTCQueryTypeLoadable,
13764 cdhash,
13765 NULL);
13766 #endif
13767
13768 if (kr == KERN_SUCCESS) {
13769 return true;
13770 }
13771 return false;
13772 }
13773
13774 bool
13775 pmap_lookup_in_loaded_trust_caches(
13776 const uint8_t cdhash[CS_CDHASH_LEN])
13777 {
13778 #if XNU_MONITOR
13779 return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
13780 #else
13781 return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
13782 #endif
13783 }
13784
13785 MARK_AS_PMAP_TEXT uint32_t
13786 pmap_lookup_in_static_trust_cache_internal(
13787 const uint8_t cdhash[CS_CDHASH_LEN])
13788 {
13789 TrustCacheQueryToken_t query_token = {0};
13790 kern_return_t kr = KERN_NOT_FOUND;
13791 uint64_t flags = 0;
13792 uint8_t hash_type = 0;
13793
13794 #if PMAP_CS_PPL_MONITOR
13795 /*
13796 * If we have the PPL monitor, then this function can only be called from
13797 * within the PPL. Calling it directly would've caused a panic, so we can
13798 * assume that we're in the PPL here.
13799 */
13800 uint8_t cdhash_safe[CS_CDHASH_LEN];
13801 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
13802
13803 kr = pmap_query_trust_cache_safe(
13804 kTCQueryTypeStatic,
13805 cdhash_safe,
13806 &query_token);
13807 #else
13808 kr = query_trust_cache(
13809 kTCQueryTypeStatic,
13810 cdhash,
13811 &query_token);
13812 #endif
13813
13814 if (kr == KERN_SUCCESS) {
13815 amfi->TrustCache.queryGetFlags(&query_token, &flags);
13816 amfi->TrustCache.queryGetHashType(&query_token, &hash_type);
13817
13818 return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
13819 (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
13820 ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
13821 }
13822
13823 return 0;
13824 }
13825
13826 uint32_t
13827 pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
13828 {
13829 #if XNU_MONITOR
13830 return pmap_lookup_in_static_trust_cache_ppl(cdhash);
13831 #else
13832 return pmap_lookup_in_static_trust_cache_internal(cdhash);
13833 #endif
13834 }
13835
13836 #if PMAP_CS_INCLUDE_CODE_SIGNING
13837
13838 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
13839 MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
13840
13841 MARK_AS_PMAP_TEXT void
13842 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13843 {
13844
13845 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13846 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
13847 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13848
13849 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
13850 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
13851 }
13852
13853 MARK_AS_PMAP_TEXT bool
13854 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
13855 {
13856 bool match = false;
13857
13858 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
13859 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
13860 match = true;
13861 }
13862 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
13863
13864 if (match) {
13865 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
13866 }
13867
13868 return match;
13869 }
13870
13871 void
13872 pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
13873 {
13874 #if XNU_MONITOR
13875 pmap_set_compilation_service_cdhash_ppl(cdhash);
13876 #else
13877 pmap_set_compilation_service_cdhash_internal(cdhash);
13878 #endif
13879 }
13880
13881 bool
13882 pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
13883 {
13884 #if XNU_MONITOR
13885 return pmap_match_compilation_service_cdhash_ppl(cdhash);
13886 #else
13887 return pmap_match_compilation_service_cdhash_internal(cdhash);
13888 #endif
13889 }
13890
13891 /*
13892 * As part of supporting local signing on the device, we need the PMAP layer
13893 * to store the local signing key so that PMAP_CS can validate with it. We
13894 * store it at the PMAP layer such that it is accessible to both AMFI and
13895 * PMAP_CS should they need it.
13896 */
13897 MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
13898 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
13899
13900 MARK_AS_PMAP_TEXT void
13901 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13902 {
13903 bool key_set = false;
13904
13905 /*
13906 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
13907 * a successful exchange means that the local signing public key has _not_ been
13908 * set. In case the key has been set, we panic as we would never expect the
13909 * kernel to attempt to set the key more than once.
13910 */
13911 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
13912
13913 if (key_set) {
13914 panic("attempted to set the local signing public key multiple times");
13915 }
13916
13917 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
13918 pmap_cs_log_info("set local signing public key");
13919 }
13920
13921 void
13922 pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
13923 {
13924 #if XNU_MONITOR
13925 return pmap_set_local_signing_public_key_ppl(public_key);
13926 #else
13927 return pmap_set_local_signing_public_key_internal(public_key);
13928 #endif
13929 }
13930
13931 uint8_t*
13932 pmap_get_local_signing_public_key(void)
13933 {
13934 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
13935
13936 if (key_set) {
13937 return pmap_local_signing_public_key;
13938 }
13939
13940 return NULL;
13941 }
13942
13943 /*
13944 * Locally signed applications need to be explicitly authorized by an entitled application
13945 * before we allow them to run.
13946 */
13947 MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
13948 MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
13949
13950 MARK_AS_PMAP_TEXT void
13951 pmap_unrestrict_local_signing_internal(
13952 const uint8_t cdhash[CS_CDHASH_LEN])
13953 {
13954
13955 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13956 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13957 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13958
13959 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
13960 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
13961 }
13962
13963 void
13964 pmap_unrestrict_local_signing(
13965 const uint8_t cdhash[CS_CDHASH_LEN])
13966 {
13967 #if XNU_MONITOR
13968 return pmap_unrestrict_local_signing_ppl(cdhash);
13969 #else
13970 return pmap_unrestrict_local_signing_internal(cdhash);
13971 #endif
13972 }
13973
13974 #if PMAP_CS
13975 MARK_AS_PMAP_TEXT static void
13976 pmap_restrict_local_signing(void)
13977 {
13978 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13979 memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
13980 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13981 }
13982
13983 MARK_AS_PMAP_TEXT static bool
13984 pmap_local_signing_restricted(
13985 const uint8_t cdhash[CS_CDHASH_LEN])
13986 {
13987 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
13988 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
13989 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
13990
13991 return ret != 0;
13992 }
13993
#endif /* PMAP_CS */
#endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
13996
13997 MARK_AS_PMAP_TEXT void
13998 pmap_footprint_suspend_internal(
13999 vm_map_t map,
14000 boolean_t suspend)
14001 {
14002 #if DEVELOPMENT || DEBUG
14003 if (suspend) {
14004 current_thread()->pmap_footprint_suspended = TRUE;
14005 map->pmap->footprint_was_suspended = TRUE;
14006 } else {
14007 current_thread()->pmap_footprint_suspended = FALSE;
14008 }
14009 #else /* DEVELOPMENT || DEBUG */
14010 (void) map;
14011 (void) suspend;
14012 #endif /* DEVELOPMENT || DEBUG */
14013 }
14014
14015 void
14016 pmap_footprint_suspend(
14017 vm_map_t map,
14018 boolean_t suspend)
14019 {
14020 #if XNU_MONITOR
14021 pmap_footprint_suspend_ppl(map, suspend);
14022 #else
14023 pmap_footprint_suspend_internal(map, suspend);
14024 #endif
14025 }
14026
14027 MARK_AS_PMAP_TEXT void
14028 pmap_nop_internal(pmap_t pmap __unused)
14029 {
14030 validate_pmap_mutable(pmap);
14031 }
14032
14033 void
14034 pmap_nop(pmap_t pmap)
14035 {
14036 #if XNU_MONITOR
14037 pmap_nop_ppl(pmap);
14038 #else
14039 pmap_nop_internal(pmap);
14040 #endif
14041 }
14042
14043 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14044
14045 struct page_table_dump_header {
14046 uint64_t pa;
14047 uint64_t num_entries;
14048 uint64_t start_va;
14049 uint64_t end_va;
14050 };
14051
14052 static kern_return_t
14053 pmap_dump_page_tables_recurse(pmap_t pmap,
14054 const tt_entry_t *ttp,
14055 unsigned int cur_level,
14056 unsigned int level_mask,
14057 uint64_t start_va,
14058 void *buf_start,
14059 void *buf_end,
14060 size_t *bytes_copied)
14061 {
14062 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14063 uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);
14064
14065 uint64_t size = pt_attr->pta_level_info[cur_level].size;
14066 uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
14067 uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
14068 uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
14069
14070 void *bufp = (uint8_t*)buf_start + *bytes_copied;
14071
14072 if (cur_level == pt_attr_root_level(pt_attr)) {
14073 num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
14074 }
14075
14076 uint64_t tt_size = num_entries * sizeof(tt_entry_t);
14077 const tt_entry_t *tt_end = &ttp[num_entries];
14078
14079 if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
14080 return KERN_INSUFFICIENT_BUFFER_SIZE;
14081 }
14082
14083 if (level_mask & (1U << cur_level)) {
14084 struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
14085 header->pa = ml_static_vtop((vm_offset_t)ttp);
14086 header->num_entries = num_entries;
14087 header->start_va = start_va;
14088 header->end_va = start_va + (num_entries * size);
14089
14090 bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
14091 *bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
14092 }
14093 uint64_t current_va = start_va;
14094
14095 for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
14096 tt_entry_t tte = *ttep;
14097
14098 if (!(tte & valid_mask)) {
14099 continue;
14100 }
14101
14102 if ((tte & type_mask) == type_block) {
14103 continue;
14104 } else {
14105 if (cur_level >= pt_attr_leaf_level(pt_attr)) {
14106 panic("%s: corrupt entry %#llx at %p, "
14107 "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
14108 __FUNCTION__, tte, ttep,
14109 ttp, cur_level, bufp, buf_end);
14110 }
14111
14112 const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
14113
14114 kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
14115 level_mask, current_va, buf_start, buf_end, bytes_copied);
14116
14117 if (recurse_result != KERN_SUCCESS) {
14118 return recurse_result;
14119 }
14120 }
14121 }
14122
14123 return KERN_SUCCESS;
14124 }
14125
14126 kern_return_t
14127 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
14128 {
14129 if (not_in_kdp) {
14130 panic("pmap_dump_page_tables must only be called from kernel debugger context");
14131 }
14132 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
14133 level_mask, pmap->min, bufp, buf_end, bytes_copied);
14134 }
14135
14136 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14137
14138 kern_return_t
14139 pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
14140 unsigned int level_mask __unused, size_t *bytes_copied __unused)
14141 {
14142 return KERN_NOT_SUPPORTED;
14143 }
14144 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14145
14146
14147 #ifdef CONFIG_XNUPOST
14148 #ifdef __arm64__
14149 static volatile bool pmap_test_took_fault = false;
14150
14151 static bool
14152 pmap_test_fault_handler(arm_saved_state_t * state)
14153 {
14154 bool retval = false;
14155 uint32_t esr = get_saved_state_esr(state);
14156 esr_exception_class_t class = ESR_EC(esr);
14157 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14158
14159 if ((class == ESR_EC_DABORT_EL1) &&
14160 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14161 pmap_test_took_fault = true;
14162 /* return to the instruction immediately after the call to NX page */
14163 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14164 retval = true;
14165 }
14166
14167 return retval;
14168 }
14169
14170 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
14171 static NOKASAN bool
14172 pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
14173 {
14174 pmap_t old_pmap = NULL;
14175
14176 pmap_test_took_fault = false;
14177
14178 /*
14179 * We're potentially switching pmaps without using the normal thread
14180 * mechanism; disable interrupts and preemption to avoid any unexpected
14181 * memory accesses.
14182 */
14183 uint64_t old_int_state = pmap_interrupts_disable();
14184 mp_disable_preemption();
14185
14186 if (pmap != NULL) {
14187 old_pmap = current_pmap();
14188 pmap_switch(pmap);
14189
14190 /* Disable PAN; pmap shouldn't be the kernel pmap. */
14191 #if __ARM_PAN_AVAILABLE__
14192 __builtin_arm_wsr("pan", 0);
14193 #endif /* __ARM_PAN_AVAILABLE__ */
14194 }
14195
14196 ml_expect_fault_begin(pmap_test_fault_handler, va);
14197
14198 if (is_write) {
14199 *((volatile uint64_t*)(va)) = 0xdec0de;
14200 } else {
14201 volatile uint64_t tmp = *((volatile uint64_t*)(va));
14202 (void)tmp;
14203 }
14204
14205 /* Save the fault bool, and undo the gross stuff we did. */
14206 bool took_fault = pmap_test_took_fault;
14207 ml_expect_fault_end();
14208
14209 if (pmap != NULL) {
14210 #if __ARM_PAN_AVAILABLE__
14211 __builtin_arm_wsr("pan", 1);
14212 #endif /* __ARM_PAN_AVAILABLE__ */
14213
14214 pmap_switch(old_pmap);
14215 }
14216
14217 mp_enable_preemption();
14218 pmap_interrupts_restore(old_int_state);
14219 bool retval = (took_fault == should_fault);
14220 return retval;
14221 }
14222
14223 static bool
14224 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14225 {
14226 bool retval = pmap_test_access(pmap, va, should_fault, false);
14227
14228 if (!retval) {
14229 T_FAIL("%s: %s, "
14230 "pmap=%p, va=%p, should_fault=%u",
14231 __func__, should_fault ? "did not fault" : "faulted",
14232 pmap, (void*)va, (unsigned)should_fault);
14233 }
14234
14235 return retval;
14236 }
14237
14238 static bool
14239 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14240 {
14241 bool retval = pmap_test_access(pmap, va, should_fault, true);
14242
14243 if (!retval) {
14244 T_FAIL("%s: %s, "
14245 "pmap=%p, va=%p, should_fault=%u",
14246 __func__, should_fault ? "did not fault" : "faulted",
14247 pmap, (void*)va, (unsigned)should_fault);
14248 }
14249
14250 return retval;
14251 }
14252
14253 static bool
14254 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14255 {
14256 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14257 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14258
14259 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14260
14261 if (!retval) {
14262 T_FAIL("%s: bits=%u, "
14263 "pa=%p, should_be_set=%u",
14264 __func__, bits,
14265 (void*)pa, should_be_set);
14266 }
14267
14268 return retval;
14269 }
14270
14271 static __attribute__((noinline)) bool
14272 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14273 {
14274 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14275 return retval;
14276 }
14277
14278 static int
14279 pmap_test_test_config(unsigned int flags)
14280 {
14281 T_LOG("running pmap_test_test_config flags=0x%X", flags);
14282 unsigned int map_count = 0;
14283 unsigned long page_ratio = 0;
14284 pmap_t pmap = pmap_create_options(NULL, 0, flags);
14285
14286 if (!pmap) {
14287 panic("Failed to allocate pmap");
14288 }
14289
14290 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
14291 uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
14292 uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
14293 uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);
14294
14295 if (pmap_page_size <= native_page_size) {
14296 page_ratio = native_page_size / pmap_page_size;
14297 } else {
14298 /*
14299 * We claim to support a page_ratio of less than 1, which is
14300 * not currently supported by the pmap layer; panic.
14301 */
14302 panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
14303 "flags=%u",
14304 __func__, native_page_size, pmap_page_size,
14305 flags);
14306 }
14307
14308 if (PAGE_RATIO > 1) {
14309 /*
14310 * The kernel is deliberately pretending to have 16KB pages.
14311 * The pmap layer has code that supports this, so pretend the
14312 * page size is larger than it is.
14313 */
14314 pmap_page_size = PAGE_SIZE;
14315 native_page_size = PAGE_SIZE;
14316 }
14317
14318 /*
14319 * Get two pages from the VM; one to be mapped wired, and one to be
14320 * mapped nonwired.
14321 */
14322 vm_page_t unwired_vm_page = vm_page_grab();
14323 vm_page_t wired_vm_page = vm_page_grab();
14324
14325 if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
14326 panic("Failed to grab VM pages");
14327 }
14328
14329 ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
14330 ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);
14331
14332 pmap_paddr_t pa = ptoa(pn);
14333 pmap_paddr_t wired_pa = ptoa(wired_pn);
14334
14335 /*
14336 * We'll start mappings at the second twig TT. This keeps us from only
14337 * using the first entry in each TT, which would trivially be address
14338 * 0; one of the things we will need to test is retrieving the VA for
14339 * a given PTE.
14340 */
14341 vm_map_address_t va_base = pmap_twig_size;
14342 vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);
14343
14344 if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
14345 /*
14346 * Not exactly a functional failure, but this test relies on
14347 * there being a spare PTE slot we can use to pin the TT.
14348 */
14349 panic("Cannot pin translation table");
14350 }
14351
14352 /*
14353 * Create the wired mapping; this will prevent the pmap layer from
14354 * reclaiming our test TTs, which would interfere with this test
14355 * ("interfere" -> "make it panic").
14356 */
14357 pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);
14358
14359 #if XNU_MONITOR
14360 /*
14361 * If the PPL is enabled, make sure that the kernel cannot write
14362 * to PPL memory.
14363 */
14364 if (!pmap_ppl_disable) {
14365 T_LOG("Validate that kernel cannot write to PPL memory.");
14366 pt_entry_t * ptep = pmap_pte(pmap, va_base);
14367 pmap_test_write(NULL, (vm_map_address_t)ptep, true);
14368 }
14369 #endif
14370
14371 /*
14372 * Create read-only mappings of the nonwired page; if the pmap does
14373 * not use the same page size as the kernel, create multiple mappings
14374 * so that the kernel page is fully mapped.
14375 */
14376 for (map_count = 0; map_count < page_ratio; map_count++) {
14377 pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
14378 }
14379
14380 /* Validate that all the PTEs have the expected PA and VA. */
14381 for (map_count = 0; map_count < page_ratio; map_count++) {
14382 pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));
14383
14384 if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
14385 T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
14386 (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
14387 }
14388
14389 if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
14390 T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
14391 (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
14392 }
14393 }
14394
14395 T_LOG("Validate that reads to our mapping do not fault.");
14396 pmap_test_read(pmap, va_base, false);
14397
14398 T_LOG("Validate that writes to our mapping fault.");
14399 pmap_test_write(pmap, va_base, true);
14400
14401 T_LOG("Make the first mapping writable.");
14402 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14403
14404 T_LOG("Validate that writes to our mapping do not fault.");
14405 pmap_test_write(pmap, va_base, false);
14406
14407
14408 T_LOG("Make the first mapping execute-only");
14409 pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);
14410
14411
14412 T_LOG("Validate that reads to our mapping do not fault.");
14413 pmap_test_read(pmap, va_base, false);
14414
14415 T_LOG("Validate that writes to our mapping fault.");
14416 pmap_test_write(pmap, va_base, true);
14417
14418
14419 /*
14420 * For page ratios of greater than 1: validate that writes to the other
14421 * mappings still fault. Remove the mappings afterwards (we're done
14422 * with page ratio testing).
14423 */
14424 for (map_count = 1; map_count < page_ratio; map_count++) {
14425 pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
14426 pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
14427 }
14428
14429 T_LOG("Mark the page unreferenced and unmodified.");
14430 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14431 pmap_test_check_refmod(pa, 0);
14432
14433 /*
14434 * Begin testing the ref/mod state machine. Re-enter the mapping with
14435 * different protection/fault_type settings, and confirm that the
14436 * ref/mod state matches our expectations at each step.
14437 */
14438 T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
14439 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
14440 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14441
14442 T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
14443 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14444 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14445 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14446
14447 T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
14448 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14449 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
14450 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14451
14452 T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
14453 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
14454 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14455
14456 T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
14457 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14458 pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14459 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14460
14461 /*
14462 * Shared memory testing; we'll have two mappings; one read-only,
14463 * one read-write.
14464 */
14465 vm_map_address_t rw_base = va_base;
14466 vm_map_address_t ro_base = va_base + pmap_page_size;
14467
14468 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14469 pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
14470
14471 /*
14472 * Test that we take faults as expected for unreferenced/unmodified
14473 * pages. Also test the arm_fast_fault interface, to ensure that
14474 * mapping permissions change as expected.
14475 */
14476 T_LOG("!ref/!mod: expect no access");
14477 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14478 pmap_test_read_write(pmap, ro_base, false, false);
14479 pmap_test_read_write(pmap, rw_base, false, false);
14480
14481 T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
14482 arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
14483 pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
14484 pmap_test_read_write(pmap, ro_base, true, false);
14485 pmap_test_read_write(pmap, rw_base, true, false);
14486
14487 T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
14488 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14489 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14490 pmap_test_read_write(pmap, ro_base, true, false);
14491 pmap_test_read_write(pmap, rw_base, true, true);
14492
14493 T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
14494 pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
14495 arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
14496 pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14497 pmap_test_read_write(pmap, ro_base, true, false);
14498 pmap_test_read_write(pmap, rw_base, true, true);
14499
14500 T_LOG("RW protect both mappings; should not change protections.");
14501 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14502 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
14503 pmap_test_read_write(pmap, ro_base, true, false);
14504 pmap_test_read_write(pmap, rw_base, true, true);
14505
14506 T_LOG("Read protect both mappings; RW mapping should become RO.");
14507 pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
14508 pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
14509 pmap_test_read_write(pmap, ro_base, true, false);
14510 pmap_test_read_write(pmap, rw_base, true, false);
14511
14512 T_LOG("RW protect the page; mappings should not change protections.");
14513 pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
14514 pmap_page_protect(pn, VM_PROT_ALL);
14515 pmap_test_read_write(pmap, ro_base, true, false);
14516 pmap_test_read_write(pmap, rw_base, true, true);
14517
14518 T_LOG("Read protect the page; RW mapping should become RO.");
14519 pmap_page_protect(pn, VM_PROT_READ);
14520 pmap_test_read_write(pmap, ro_base, true, false);
14521 pmap_test_read_write(pmap, rw_base, true, false);
14522
14523 T_LOG("Validate that disconnect removes all known mappings of the page.");
14524 pmap_disconnect(pn);
14525 if (!pmap_verify_free(pn)) {
14526 T_FAIL("Page still has mappings");
14527 }
14528
14529 T_LOG("Remove the wired mapping, so we can tear down the test map.");
14530 pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
14531 pmap_destroy(pmap);
14532
14533 T_LOG("Release the pages back to the VM.");
14534 vm_page_lock_queues();
14535 vm_page_free(unwired_vm_page);
14536 vm_page_free(wired_vm_page);
14537 vm_page_unlock_queues();
14538
14539 T_LOG("Testing successful!");
14540 return 0;
14541 }
14542 #endif /* __arm64__ */
14543
14544 kern_return_t
14545 pmap_test(void)
14546 {
14547 T_LOG("Starting pmap_tests");
14548 #ifdef __arm64__
14549 int flags = 0;
14550 flags |= PMAP_CREATE_64BIT;
14551
14552 #if __ARM_MIXED_PAGE_SIZE__
14553 T_LOG("Testing VM_PAGE_SIZE_4KB");
14554 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14555 T_LOG("Testing VM_PAGE_SIZE_16KB");
14556 pmap_test_test_config(flags);
14557 #else /* __ARM_MIXED_PAGE_SIZE__ */
14558 pmap_test_test_config(flags);
14559 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14560
14561 #endif /* __arm64__ */
14562 T_PASS("completed pmap_test successfully");
14563 return KERN_SUCCESS;
14564 }
14565 #endif /* CONFIG_XNUPOST */
14566
/*
 * The following functions should never make it to RELEASE code, since
 * they provide a way to get the PPL to modify text pages.
 */
14571 #if DEVELOPMENT || DEBUG
14572
/*
 * Values written over text by pmap_test_text_corruption_internal(): the
 * 32-bit value is stored when the target address has its low bit clear, the
 * 16-bit value when the low bit is set (treated there as a THUMB
 * instruction address).
 */
#define ARM_UNDEFINED_INSN 0xe7f000f0
#define ARM_UNDEFINED_INSN_THUMB 0xde00
14575
14576 /**
14577 * Forcibly overwrite executable text with an illegal instruction.
14578 *
14579 * @note Only used for xnu unit testing.
14580 *
14581 * @param pa The physical address to corrupt.
14582 *
14583 * @return KERN_SUCCESS on success.
14584 */
14585 kern_return_t
14586 pmap_test_text_corruption(pmap_paddr_t pa)
14587 {
14588 #if XNU_MONITOR
14589 return pmap_test_text_corruption_ppl(pa);
14590 #else /* XNU_MONITOR */
14591 return pmap_test_text_corruption_internal(pa);
14592 #endif /* XNU_MONITOR */
14593 }
14594
14595 MARK_AS_PMAP_TEXT kern_return_t
14596 pmap_test_text_corruption_internal(pmap_paddr_t pa)
14597 {
14598 vm_offset_t va = phystokv(pa);
14599 unsigned int pai = pa_index(pa);
14600
14601 assert(pa_valid(pa));
14602
14603 pvh_lock(pai);
14604
14605 pv_entry_t **pv_h = pai_to_pvh(pai);
14606 assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
14607 #if defined(PVH_FLAG_EXEC)
14608 const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;
14609
14610 if (need_ap_twiddle) {
14611 pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
14612 }
14613 #endif /* defined(PVH_FLAG_EXEC) */
14614
14615 /*
14616 * The low bit in an instruction address indicates a THUMB instruction
14617 */
14618 if (va & 1) {
14619 va &= ~(vm_offset_t)1;
14620 *(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
14621 } else {
14622 *(uint32_t *)va = ARM_UNDEFINED_INSN;
14623 }
14624
14625 #if defined(PVH_FLAG_EXEC)
14626 if (need_ap_twiddle) {
14627 pmap_set_ptov_ap(pai, AP_RONA, FALSE);
14628 }
14629 #endif /* defined(PVH_FLAG_EXEC) */
14630
14631 InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));
14632
14633 pvh_unlock(pai);
14634
14635 return KERN_SUCCESS;
14636 }
14637
14638 #endif /* DEVELOPMENT || DEBUG */
14639