1 /*
2 * Copyright (c) 2011-2021, 2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <string.h>
29 #include <stdlib.h>
30 #include <mach_assert.h>
31 #include <mach_ldebug.h>
32
33 #include <mach/shared_region.h>
34 #include <mach/vm_param.h>
35 #include <mach/vm_prot.h>
36 #include <mach/vm_map.h>
37 #include <mach/machine/vm_param.h>
38 #include <mach/machine/vm_types.h>
39
40 #include <mach/boolean.h>
41 #include <kern/bits.h>
42 #include <kern/ecc.h>
43 #include <kern/thread.h>
44 #include <kern/sched.h>
45 #include <kern/zalloc.h>
46 #include <kern/zalloc_internal.h>
47 #include <kern/kalloc.h>
48 #include <kern/spl.h>
49 #include <kern/startup.h>
50 #include <kern/trustcache.h>
51
52 #include <os/overflow.h>
53
54 #include <vm/pmap.h>
55 #include <vm/pmap_cs.h>
56 #include <vm/vm_map.h>
57 #include <vm/vm_kern.h>
58 #include <vm/vm_protos.h>
59 #include <vm/vm_object.h>
60 #include <vm/vm_page.h>
61 #include <vm/vm_pageout.h>
62 #include <vm/cpm.h>
63
64 #include <libkern/img4/interface.h>
65 #include <libkern/amfi/amfi.h>
66 #include <libkern/section_keywords.h>
67 #include <sys/errno.h>
68 #include <sys/code_signing.h>
69 #include <sys/trust_caches.h>
70
71 #include <machine/atomic.h>
72 #include <machine/thread.h>
73 #include <machine/lowglobals.h>
74
75 #include <arm/caches_internal.h>
76 #include <arm/cpu_data.h>
77 #include <arm/cpu_data_internal.h>
78 #include <arm/cpu_capabilities.h>
79 #include <arm/cpu_number.h>
80 #include <arm/machine_cpu.h>
81 #include <arm/misc_protos.h>
82 #include <arm/pmap/pmap_internal.h>
83 #include <arm/trap_internal.h>
84
85 #include <arm64/proc_reg.h>
86 #include <pexpert/arm64/boot.h>
87 #include <arm64/ppl/sart.h>
88 #include <arm64/ppl/uat.h>
89
90 #if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
91 #include <arm64/amcc_rorgn.h>
92 #endif // defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
93
94 #include <pexpert/device_tree.h>
95
96 #include <san/kasan.h>
97 #include <sys/cdefs.h>
98
99 #if defined(HAS_APPLE_PAC)
100 #include <ptrauth.h>
101 #endif
102
103 #ifdef CONFIG_XNUPOST
104 #include <tests/xnupost.h>
105 #endif
106
107
108 #if HIBERNATION
109 #include <IOKit/IOHibernatePrivate.h>
110 #endif /* HIBERNATION */
111
#ifdef __ARM64_PMAP_SUBPAGE_L1__
/* Sub-page L1: root table only needs enough TTEs to cover the L1 index space. */
#define PMAP_ROOT_ALLOC_SIZE (((ARM_TT_L1_INDEX_MASK >> ARM_TT_L1_SHIFT) + 1) * sizeof(tt_entry_t))
#else
/* Otherwise the root translation table occupies a full hardware page. */
#define PMAP_ROOT_ALLOC_SIZE (ARM_PGBYTES)
#endif

#if __ARM_VMSA__ != 8
#error Unknown __ARM_VMSA__
#endif

/* Element count of an actual array; must not be applied to pointers or array parameters. */
#define ARRAY_LEN(x) (sizeof (x) / sizeof (x[0]))

extern u_int32_t random(void);  /* from <libkern/libkern.h> */

/* Forward declarations for the native page-table ops vector (native_pt_ops, below). */
static bool alloc_asid(pmap_t pmap);
static void free_asid(pmap_t pmap);
static void flush_mmu_tlb_region_asid_async(vm_offset_t va, size_t length, pmap_t pmap, bool last_level_only, bool strong);
static void flush_mmu_tlb_full_asid_async(pmap_t pmap);
static pt_entry_t wimg_to_pte(unsigned int wimg, pmap_paddr_t pa);
131
/*
 * Operations vector for native (host) page tables. Generic pmap code
 * dispatches through this table for ASID allocation/release, TLB
 * maintenance, and WIMG-attribute-to-PTE conversion.
 */
const struct page_table_ops native_pt_ops =
{
	.alloc_id = alloc_asid,
	.free_id = free_asid,
	.flush_tlb_region_async = flush_mmu_tlb_region_asid_async,
	.flush_tlb_async = flush_mmu_tlb_full_asid_async,
	.wimg_to_pte = wimg_to_pte,
};
140
/*
 * Per-level translation-table geometry for the 16KB granule, indexed by
 * level (L0-L3). L3 entries are leaf PTEs, so their valid/type masks and
 * block type differ from the L0-L2 table/block masks.
 */
const struct page_table_level_info pmap_table_level_info_16k[] =
{
	[0] = {
		.size       = ARM_16K_TT_L0_SIZE,
		.offmask    = ARM_16K_TT_L0_OFFMASK,
		.shift      = ARM_16K_TT_L0_SHIFT,
		.index_mask = ARM_16K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_16K_TT_L1_SIZE,
		.offmask    = ARM_16K_TT_L1_OFFMASK,
		.shift      = ARM_16K_TT_L1_SHIFT,
		.index_mask = ARM_16K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_16K_TT_L2_SIZE,
		.offmask    = ARM_16K_TT_L2_OFFMASK,
		.shift      = ARM_16K_TT_L2_SHIFT,
		.index_mask = ARM_16K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: uses PTE (not TTE) valid/type encodings. */
		.size       = ARM_16K_TT_L3_SIZE,
		.offmask    = ARM_16K_TT_L3_OFFMASK,
		.shift      = ARM_16K_TT_L3_SHIFT,
		.index_mask = ARM_16K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
180
/*
 * Per-level translation-table geometry for the 4KB granule, indexed by
 * level (L0-L3). Mirrors pmap_table_level_info_16k; keep the two tables
 * structurally in sync.
 */
const struct page_table_level_info pmap_table_level_info_4k[] =
{
	[0] = {
		.size       = ARM_4K_TT_L0_SIZE,
		.offmask    = ARM_4K_TT_L0_OFFMASK,
		.shift      = ARM_4K_TT_L0_SHIFT,
		.index_mask = ARM_4K_TT_L0_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[1] = {
		.size       = ARM_4K_TT_L1_SIZE,
		.offmask    = ARM_4K_TT_L1_OFFMASK,
		.shift      = ARM_4K_TT_L1_SHIFT,
		.index_mask = ARM_4K_TT_L1_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[2] = {
		.size       = ARM_4K_TT_L2_SIZE,
		.offmask    = ARM_4K_TT_L2_OFFMASK,
		.shift      = ARM_4K_TT_L2_SHIFT,
		.index_mask = ARM_4K_TT_L2_INDEX_MASK,
		.valid_mask = ARM_TTE_VALID,
		.type_mask  = ARM_TTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_BLOCK
	},
	[3] = {
		/* Leaf level: uses PTE (not TTE) valid/type encodings. */
		.size       = ARM_4K_TT_L3_SIZE,
		.offmask    = ARM_4K_TT_L3_OFFMASK,
		.shift      = ARM_4K_TT_L3_SHIFT,
		.index_mask = ARM_4K_TT_L3_INDEX_MASK,
		.valid_mask = ARM_PTE_TYPE_VALID,
		.type_mask  = ARM_PTE_TYPE_MASK,
		.type_block = ARM_TTE_TYPE_L3BLOCK
	}
};
220
/*
 * Page-table attributes for the 4KB translation granule.
 * pta_root_level is derived from T0SZ_BOOT; with a 4KB granule each table
 * level resolves 9 bits of VA (hence the divide by 9).
 */
const struct page_table_attr pmap_pt_attr_4k = {
	.pta_level_info = pmap_table_level_info_4k,
	.pta_root_level = (T0SZ_BOOT - 16) / 9,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_MIXED_PAGE_SIZE__ */
#if __ARM_16K_PG__
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
#else /* __ARM_16K_PG__ */
	.pta_commpage_level = PMAP_TT_L1_LEVEL,
#endif /* __ARM_16K_PG__ */
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never PTE templates. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_4KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 4096,
	.pta_pagezero_size = 4096,
	.pta_page_shift = 12,
};
248
/*
 * Page-table attributes for the 16KB translation granule.
 * Keep field-for-field parallel with pmap_pt_attr_4k above.
 */
const struct page_table_attr pmap_pt_attr_16k = {
	.pta_level_info = pmap_table_level_info_16k,
	.pta_root_level = PMAP_TT_L1_LEVEL,
	.pta_commpage_level = PMAP_TT_L2_LEVEL,
	.pta_max_level  = PMAP_TT_L3_LEVEL,
	.pta_ops = &native_pt_ops,
	/* Access-permission and execute-never PTE templates. */
	.ap_ro = ARM_PTE_AP(AP_RORO),
	.ap_rw = ARM_PTE_AP(AP_RWRW),
	.ap_rona = ARM_PTE_AP(AP_RONA),
	.ap_rwna = ARM_PTE_AP(AP_RWNA),
	.ap_xn = ARM_PTE_PNX | ARM_PTE_NX,
	.ap_x = ARM_PTE_PNX,
#if __ARM_MIXED_PAGE_SIZE__
	.pta_tcr_value  = TCR_EL1_16KB,
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	.pta_page_size  = 16384,
	.pta_pagezero_size = 16384,
	.pta_page_shift = 14,
};
268
/* Select the boot-time native page-table attribute set by configured granule. */
#if __ARM_16K_PG__
const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_16k;
#else /* !__ARM_16K_PG__ */
const struct page_table_attr * const native_pt_attr = &pmap_pt_attr_4k;
#endif /* !__ARM_16K_PG__ */


#if MACH_ASSERT
int vm_footprint_suspend_allowed = 1;

/* Ledger-imbalance panic policy knobs; defined elsewhere. */
extern int pmap_ledgers_panic;
extern int pmap_ledgers_panic_leeway;

#endif /* MACH_ASSERT */

#if DEVELOPMENT || DEBUG
/* True when footprint accounting is suspended for the current thread. */
#define PMAP_FOOTPRINT_SUSPENDED(pmap) \
	(current_thread()->pmap_footprint_suspended)
#else /* DEVELOPMENT || DEBUG */
#define PMAP_FOOTPRINT_SUSPENDED(pmap) (FALSE)
#endif /* DEVELOPMENT || DEBUG */


/*
 * Represents a tlb range that will be flushed before exiting
 * the ppl.
 * Used by phys_attribute_clear_range to defer flushing pages in
 * this range until the end of the operation.
 */
typedef struct pmap_tlb_flush_range {
	pmap_t ptfr_pmap;                /* pmap whose mappings are being modified */
	vm_map_address_t ptfr_start;     /* inclusive start VA of the range */
	vm_map_address_t ptfr_end;       /* end VA of the range */
	bool ptfr_flush_needed;          /* set when a deferred flush is pending */
} pmap_tlb_flush_range_t;
304
#if XNU_MONITOR
/*
 * PPL External References.
 * Segment bounds of the PPL text/data regions, established at link time.
 */
extern vm_offset_t   segPPLDATAB;
extern unsigned long segSizePPLDATA;
extern vm_offset_t   segPPLTEXTB;
extern unsigned long segSizePPLTEXT;
extern vm_offset_t   segPPLDATACONSTB;
extern unsigned long segSizePPLDATACONST;


/*
 * PPL Global Variables
 */

#if (DEVELOPMENT || DEBUG) || CONFIG_CSR_FROM_DT
/* Indicates if the PPL will enforce mapping policies; set by -unsafe_kernel_text */
SECURITY_READ_ONLY_LATE(boolean_t) pmap_ppl_disable = FALSE;
#else
const boolean_t pmap_ppl_disable = FALSE;
#endif

/*
 * Indicates if the PPL has started applying APRR.
 * This variable is accessed from various assembly trampolines, so be sure to change
 * those if you change the size or layout of this variable.
 */
boolean_t pmap_ppl_locked_down MARK_AS_PMAP_DATA = FALSE;

/* Bounds of the PPL stack region; defined elsewhere. */
extern void *pmap_stacks_start;
extern void *pmap_stacks_end;

#endif /* XNU_MONITOR */
339
340
341
/* Virtual memory region for early allocation */
#define VREGION1_HIGH_WINDOW    (PE_EARLY_BOOT_VA)
#define VREGION1_START          ((VM_MAX_KERNEL_ADDRESS & CPUWINDOWS_BASE_MASK) - VREGION1_HIGH_WINDOW)
#define VREGION1_SIZE           (trunc_page(VM_MAX_KERNEL_ADDRESS - (VREGION1_START)))

extern uint8_t bootstrap_pagetables[];

extern unsigned int not_in_kdp;

extern vm_offset_t first_avail;

extern vm_offset_t virtual_space_start;     /* Next available kernel VA */
extern vm_offset_t virtual_space_end;       /* End of kernel address space */
extern vm_offset_t static_memory_end;

/* Bounds of the kernel physical aperture (physmap); set up during boot. */
extern const vm_map_address_t physmap_base;
extern const vm_map_address_t physmap_end;

extern int maxproc, hard_maxproc;

/* The number of address bits one TTBR can cover. */
#define PGTABLE_ADDR_BITS (64ULL - T0SZ_BOOT)

/*
 * The bounds on our TTBRs. These are for sanity checking that
 * an address is accessible by a TTBR before we attempt to map it.
 */

/* The level of the root of a page table. */
const uint64_t arm64_root_pgtable_level = (3 - ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) / (ARM_PGSHIFT - TTE_SHIFT)));

/* The number of entries in the root TT of a page table. */
const uint64_t arm64_root_pgtable_num_ttes = (2 << ((PGTABLE_ADDR_BITS - 1 - ARM_PGSHIFT) % (ARM_PGSHIFT - TTE_SHIFT)));

/* Statically-allocated kernel pmap; all kernel mappings live here. */
struct pmap kernel_pmap_store MARK_AS_PMAP_DATA;
const pmap_t kernel_pmap = &kernel_pmap_store;

static SECURITY_READ_ONLY_LATE(zone_t) pmap_zone;  /* zone of pmap structures */

/* Protects the global list of pmaps (map_pmap_list) and TT1 free lists. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmaps_lock, 0);
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(tt1_lock, 0);
queue_head_t map_pmap_list MARK_AS_PMAP_DATA;
384
385 typedef struct tt_free_entry {
386 struct tt_free_entry *next;
387 } tt_free_entry_t;
388
389 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
390
391 tt_free_entry_t *free_page_size_tt_list MARK_AS_PMAP_DATA;
392 unsigned int free_page_size_tt_count MARK_AS_PMAP_DATA;
393 unsigned int free_page_size_tt_max MARK_AS_PMAP_DATA;
394 #define FREE_PAGE_SIZE_TT_MAX 4
395 tt_free_entry_t *free_two_page_size_tt_list MARK_AS_PMAP_DATA;
396 unsigned int free_two_page_size_tt_count MARK_AS_PMAP_DATA;
397 unsigned int free_two_page_size_tt_max MARK_AS_PMAP_DATA;
398 #define FREE_TWO_PAGE_SIZE_TT_MAX 4
399 tt_free_entry_t *free_tt_list MARK_AS_PMAP_DATA;
400 unsigned int free_tt_count MARK_AS_PMAP_DATA;
401 unsigned int free_tt_max MARK_AS_PMAP_DATA;
402
403 #define TT_FREE_ENTRY_NULL ((tt_free_entry_t *) 0)
404
/* Pagetable-page usage accounting, split by pmap kind and table level. */
unsigned int inuse_user_ttepages_count MARK_AS_PMAP_DATA = 0;    /* non-root, non-leaf user pagetable pages, in units of PAGE_SIZE */
unsigned int inuse_user_ptepages_count MARK_AS_PMAP_DATA = 0;    /* leaf user pagetable pages, in units of PAGE_SIZE */
unsigned int inuse_user_tteroot_count MARK_AS_PMAP_DATA = 0;     /* root user pagetables, in units of PMAP_ROOT_ALLOC_SIZE */
unsigned int inuse_kernel_ttepages_count MARK_AS_PMAP_DATA = 0;  /* non-root, non-leaf kernel pagetable pages, in units of PAGE_SIZE */
unsigned int inuse_kernel_ptepages_count MARK_AS_PMAP_DATA = 0;  /* leaf kernel pagetable pages, in units of PAGE_SIZE */
unsigned int inuse_kernel_tteroot_count MARK_AS_PMAP_DATA = 0;   /* root kernel pagetables, in units of PMAP_ROOT_ALLOC_SIZE */

/* Shared all-invalid translation table and its physical address. */
SECURITY_READ_ONLY_LATE(tt_entry_t *) invalid_tte  = 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) invalid_ttep = 0;

SECURITY_READ_ONLY_LATE(tt_entry_t *) cpu_tte  = 0;  /* set by arm_vm_init() - keep out of bss */
SECURITY_READ_ONLY_LATE(pmap_paddr_t) cpu_ttep = 0;  /* set by arm_vm_init() - phys tte addr */

/* Lock group used for all pmap object locks. */
lck_grp_t pmap_lck_grp MARK_AS_PMAP_DATA;

#if DEVELOPMENT || DEBUG
/* Executable-memory policy knobs; mutable on development kernels only. */
int nx_enabled = 1;                                  /* enable no-execute protection */
int allow_data_exec  = 0;                            /* No apps may execute data */
int allow_stack_exec = 0;                            /* No apps may execute from the stack */
/* ASID allocator statistics. */
unsigned long pmap_asid_flushes MARK_AS_PMAP_DATA = 0;
unsigned long pmap_asid_hits MARK_AS_PMAP_DATA = 0;
unsigned long pmap_asid_misses MARK_AS_PMAP_DATA = 0;
#else /* DEVELOPMENT || DEBUG */
const int nx_enabled = 1;                                    /* enable no-execute protection */
const int allow_data_exec  = 0;                              /* No apps may execute data */
const int allow_stack_exec = 0;                              /* No apps may execute from the stack */
#endif /* DEVELOPMENT || DEBUG */

/**
 * This variable is set true during hibernation entry to protect pmap data structures
 * during image copying, and reset false on hibernation exit.
 */
bool hib_entry_pmap_lockdown MARK_AS_PMAP_DATA = false;
439
#if MACH_ASSERT
/* Validate a pmap's ledger balances at teardown (implementation elsewhere). */
static void pmap_check_ledgers(pmap_t pmap);
#else
/* Ledger checking is compiled out when MACH_ASSERT is disabled. */
static inline void
pmap_check_ledgers(__unused pmap_t pmap)
{
}
#endif /* MACH_ASSERT */
448
449 /**
450 * This helper function ensures that potentially-long-running batched PPL operations are
451 * called in preemptible context before entering the PPL, so that the PPL call may
452 * periodically exit to allow pending urgent ASTs to be taken.
453 */
454 static inline void
pmap_verify_preemptible(void)455 pmap_verify_preemptible(void)
456 {
457 assert(preemption_enabled() || (startup_phase < STARTUP_SUB_EARLY_BOOT));
458 }
459
SIMPLE_LOCK_DECLARE(phys_backup_lock, 0);

/* Bounds of pmap-managed physical memory; set during bootstrap. */
SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_first_phys = (pmap_paddr_t) 0;
SECURITY_READ_ONLY_LATE(pmap_paddr_t) vm_last_phys = (pmap_paddr_t) 0;

SECURITY_READ_ONLY_LATE(boolean_t) pmap_initialized = FALSE;  /* Has pmap_init completed? */

SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm_pmap_max_offset_default  = 0x0;
#if defined(__arm64__)
/* end of shared region + 512MB for various purposes */
#define ARM64_MIN_MAX_ADDRESS (SHARED_REGION_BASE_ARM64 + SHARED_REGION_SIZE_ARM64 + 0x20000000)
_Static_assert((ARM64_MIN_MAX_ADDRESS > SHARED_REGION_BASE_ARM64) && (ARM64_MIN_MAX_ADDRESS <= MACH_VM_MAX_ADDRESS),
    "Minimum address space size outside allowable range");

// Max offset is 15.375GB for devices with "large" memory config
#define ARM64_MAX_OFFSET_DEVICE_LARGE (ARM64_MIN_MAX_ADDRESS + 0x138000000)
// Max offset is 11.375GB for devices with "small" memory config
#define ARM64_MAX_OFFSET_DEVICE_SMALL (ARM64_MIN_MAX_ADDRESS + 0x38000000)


_Static_assert((ARM64_MAX_OFFSET_DEVICE_LARGE > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_LARGE <= MACH_VM_MAX_ADDRESS),
    "Large device address space size outside allowable range");
_Static_assert((ARM64_MAX_OFFSET_DEVICE_SMALL > ARM64_MIN_MAX_ADDRESS) && (ARM64_MAX_OFFSET_DEVICE_SMALL <= MACH_VM_MAX_ADDRESS),
    "Small device address space size outside allowable range");

# ifdef XNU_TARGET_OS_OSX
SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default = MACH_VM_MAX_ADDRESS;
# else
SECURITY_READ_ONLY_LATE(vm_map_offset_t) arm64_pmap_max_offset_default  = 0x0;
# endif
#endif /* __arm64__ */

#if PMAP_PANIC_DEV_WIMG_ON_MANAGED && (DEVELOPMENT || DEBUG)
SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = TRUE;
#else
SECURITY_READ_ONLY_LATE(boolean_t) pmap_panic_dev_wimg_on_managed = FALSE;
#endif

/* ASID allocator state; asid_lock protects the bitmap below. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(asid_lock, 0);
SECURITY_READ_ONLY_LATE(uint32_t) pmap_max_asids = 0;
SECURITY_READ_ONLY_LATE(uint16_t) asid_chunk_size = 0;
SECURITY_READ_ONLY_LATE(static bitmap_t*) asid_bitmap;
#if !HAS_16BIT_ASID
/* Pseudo-LRU replacement state for the hardware ASID space. */
SECURITY_READ_ONLY_LATE(int) pmap_asid_plru = 1;
static bitmap_t asid_plru_bitmap[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA;
static uint64_t asid_plru_generation[BITMAP_LEN(MAX_HW_ASIDS)] MARK_AS_PMAP_DATA = {0};
static uint64_t asid_plru_gencount MARK_AS_PMAP_DATA = 0;
#else
static uint16_t last_allocated_asid = 0;
#endif /* !HAS_16BIT_ASID */


/* Commpage pmaps and kernel VAs of the shared commpage mappings. */
#if __ARM_MIXED_PAGE_SIZE__
SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_4k;
#endif
SECURITY_READ_ONLY_LATE(pmap_t) commpage_pmap_default;
SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_text_kva = 0;
SECURITY_READ_ONLY_LATE(static vm_address_t) commpage_ro_data_kva = 0;
518
/* PTE Define Macros */

/*
 * True iff the PTE is an invalid entry carrying the "compressed page" software
 * marker. As a sanity check, panics (inside the expression) if any bits other
 * than the compressed-marker bits are set in such a PTE.
 */
#define ARM_PTE_IS_COMPRESSED(x, p) \
	((((x) & 0x3) == 0) &&          /* PTE is not valid... */      \
	((x) & ARM_PTE_COMPRESSED) &&   /* ...has "compressed" marker" */ \
	((!((x) & ~ARM_PTE_COMPRESSED_MASK)) || /* ...no other bits */ \
	(panic("compressed PTE %p 0x%llx has extra bits 0x%llx: corrupted?", \
	(p), (x), (x) & ~ARM_PTE_COMPRESSED_MASK), FALSE)))

/* True iff the PTE carries the software "wired" marker. */
#define pte_is_wired(pte)                                         \
	(((pte) & ARM_PTE_WIRED_MASK) == ARM_PTE_WIRED)

/* True iff the PTE carries the software "was writeable" marker (used for COW/fast-fault). */
#define pte_was_writeable(pte) \
	(((pte) & ARM_PTE_WRITEABLE) == ARM_PTE_WRITEABLE)

/* Set or clear the software "was writeable" marker on a PTE value. */
#define pte_set_was_writeable(pte, was_writeable) \
	do { \
	        if ((was_writeable)) { \
	                (pte) |= ARM_PTE_WRITEABLE; \
	        } else { \
	                (pte) &= ~ARM_PTE_WRITEABLE; \
	        } \
	} while(0)
542
543 static inline void
pte_set_wired(pmap_t pmap,pt_entry_t * ptep,boolean_t wired)544 pte_set_wired(pmap_t pmap, pt_entry_t *ptep, boolean_t wired)
545 {
546 if (wired) {
547 *ptep |= ARM_PTE_WIRED;
548 } else {
549 *ptep &= ~ARM_PTE_WIRED;
550 }
551 /*
552 * Do not track wired page count for kernel pagetable pages. Kernel mappings are
553 * not guaranteed to have PTDs in the first place, and kernel pagetable pages are
554 * never reclaimed.
555 */
556 if (pmap == kernel_pmap) {
557 return;
558 }
559 unsigned short *ptd_wiredcnt_ptr;
560 ptd_wiredcnt_ptr = &(ptep_get_info(ptep)->wiredcnt);
561 if (wired) {
562 os_atomic_add(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
563 } else {
564 unsigned short prev_wired = os_atomic_sub_orig(ptd_wiredcnt_ptr, (unsigned short)1, relaxed);
565 if (__improbable(prev_wired == 0)) {
566 panic("pmap %p (pte %p): wired count underflow", pmap, ptep);
567 }
568 }
569 }
570
#if HAS_FEAT_XS

/**
 * Report whether a PTE uses one of the XS-tagged "posted" memory attribute
 * indices. Stage-2 pmaps never qualify.
 *
 * @param pt_attr Page-table attributes for the owning pmap.
 * @param pte     The PTE value to inspect.
 *
 * @return true iff the PTE's attribute index is an XS posted variant.
 */
static inline bool
pte_is_xs(const pt_attr_t *pt_attr, pt_entry_t pte)
{
	if (__improbable(pt_attr->stage2)) {
		return false;
	}

	const unsigned long attridx = ARM_PTE_EXTRACT_ATTRINDX(pte);
	return (attridx == CACHE_ATTRINDX_POSTED_XS) ||
	       (attridx == CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
}

#endif /* HAS_FEAT_XS */
589
/*
 * Flush TLB entries covering [s, e) in the given pmap, then synchronize.
 * Wrapped in do/while(0) so the macro expands to a single statement and is
 * safe inside unbraced if/else bodies (CERT C PRE10-C); a bare brace-pair
 * followed by the caller's ';' would otherwise break `if (...) PMAP_UPDATE_TLBS(...); else ...`.
 */
#define PMAP_UPDATE_TLBS(pmap, s, e, strong, last_level_only) do { \
	pmap_get_pt_ops(pmap)->flush_tlb_region_async(s, (size_t)((e) - (s)), pmap, last_level_only, strong); \
	arm64_sync_tlb(strong); \
} while (0)
594
595 /*
596 * Synchronize updates to PTEs that were previously invalid or had the AF bit cleared,
597 * therefore not requiring TLBI. Use a store-load barrier to ensure subsequent loads
598 * will observe the updated PTE.
599 */
600 #define FLUSH_PTE() \
601 __builtin_arm_dmb(DMB_ISH);
602
603 /*
604 * Synchronize updates to PTEs that were previously valid and thus may be cached in
605 * TLBs. DSB is required to ensure the PTE stores have completed prior to the ensuing
606 * TLBI. This should only require a store-store barrier, as subsequent accesses in
607 * program order will not issue until the DSB completes. Prior loads may be reordered
608 * after the barrier, but their behavior should not be materially affected by the
609 * reordering. For fault-driven PTE updates such as COW, PTE contents should not
610 * matter for loads until the access is re-driven well after the TLB update is
611 * synchronized. For "involuntary" PTE access restriction due to paging lifecycle,
612 * we should be in a position to handle access faults. For "voluntary" PTE access
613 * restriction due to unmapping or protection, the decision to restrict access should
614 * have a data dependency on prior loads in order to avoid a data race.
615 */
616 #define FLUSH_PTE_STRONG() \
617 __builtin_arm_dsb(DSB_ISHST);
618
619 /**
620 * Write enough page table entries to map a single VM page. On systems where the
621 * VM page size does not match the hardware page size, multiple page table
622 * entries will need to be written.
623 *
624 * @note This function does not emit a barrier to ensure these page table writes
625 * have completed before continuing. This is commonly needed. In the case
626 * where a DMB or DSB barrier is needed, then use the write_pte() and
627 * write_pte_strong() functions respectively instead of this one.
628 *
629 * @param ptep Pointer to the first page table entry to update.
630 * @param pte The value to write into each page table entry. In the case that
631 * multiple PTEs are updated to a non-empty value, then the address
632 * in this value will automatically be incremented for each PTE
633 * write.
634 */
635 static void
write_pte_fast(pt_entry_t * ptep,pt_entry_t pte)636 write_pte_fast(pt_entry_t *ptep, pt_entry_t pte)
637 {
638 /**
639 * The PAGE_SHIFT (and in turn, the PAGE_RATIO) can be a variable on some
640 * systems, which is why it's checked at runtime instead of compile time.
641 * The "unreachable" warning needs to be suppressed because it still is a
642 * compile time constant on some systems.
643 */
644 __unreachable_ok_push
645 if (TEST_PAGE_RATIO_4) {
646 if (((uintptr_t)ptep) & 0x1f) {
647 panic("%s: PTE write is unaligned, ptep=%p, pte=%p",
648 __func__, ptep, (void*)pte);
649 }
650
651 if ((pte & ~ARM_PTE_COMPRESSED_MASK) == ARM_PTE_EMPTY) {
652 /**
653 * If we're writing an empty/compressed PTE value, then don't
654 * auto-increment the address for each PTE write.
655 */
656 *ptep = pte;
657 *(ptep + 1) = pte;
658 *(ptep + 2) = pte;
659 *(ptep + 3) = pte;
660 } else {
661 *ptep = pte;
662 *(ptep + 1) = pte | 0x1000;
663 *(ptep + 2) = pte | 0x2000;
664 *(ptep + 3) = pte | 0x3000;
665 }
666 } else {
667 *ptep = pte;
668 }
669 __unreachable_ok_pop
670 }
671
672 /**
673 * Writes enough page table entries to map a single VM page and then ensures
674 * those writes complete by executing a Data Memory Barrier.
675 *
676 * @note The DMB issued by this function is not strong enough to protect against
677 * TLB invalidates from being reordered above the PTE writes. If a TLBI
678 * instruction is going to immediately be called after this write, it's
679 * recommended to call write_pte_strong() instead of this function.
680 *
681 * See the function header for write_pte_fast() for more details on the
682 * parameters.
683 */
684 void
write_pte(pt_entry_t * ptep,pt_entry_t pte)685 write_pte(pt_entry_t *ptep, pt_entry_t pte)
686 {
687 write_pte_fast(ptep, pte);
688 FLUSH_PTE();
689 }
690
691 /**
692 * Writes enough page table entries to map a single VM page and then ensures
693 * those writes complete by executing a Data Synchronization Barrier. This
694 * barrier provides stronger guarantees than the DMB executed by write_pte().
695 *
696 * @note This function is useful if you're going to immediately flush the TLB
697 * after making the PTE write. A DSB is required to protect against the
698 * TLB invalidate being reordered before the PTE write.
699 *
700 * See the function header for write_pte_fast() for more details on the
701 * parameters.
702 */
703 static void
write_pte_strong(pt_entry_t * ptep,pt_entry_t pte)704 write_pte_strong(pt_entry_t *ptep, pt_entry_t pte)
705 {
706 write_pte_fast(ptep, pte);
707 FLUSH_PTE_STRONG();
708 }
709
710 /**
711 * Retrieve the pmap structure for the thread running on the current CPU.
712 */
713 pmap_t
current_pmap()714 current_pmap()
715 {
716 const pmap_t current = vm_map_pmap(current_thread()->map);
717
718 assert(current != NULL);
719
720 #if XNU_MONITOR
721 /**
722 * On PPL-enabled systems, it's important that PPL policy decisions aren't
723 * decided by kernel-writable memory. This function is used in various parts
724 * of the PPL, and besides validating that the pointer returned by this
725 * function is indeed a pmap structure, it's also important to ensure that
726 * it's actually the current thread's pmap. This is because different pmaps
727 * will have access to different entitlements based on the code signature of
728 * their loaded process. So if a different user pmap is set in the current
729 * thread structure (in an effort to bypass code signing restrictions), even
730 * though the structure would validate correctly as it is a real pmap
731 * structure, it should fail here.
732 *
733 * This only needs to occur for user pmaps because the kernel pmap's root
734 * page table is always the same as TTBR1 (it's set during bootstrap and not
735 * changed so it'd be redundant to check), and its code signing fields are
736 * always set to NULL. The PMAP CS logic won't operate on the kernel pmap so
737 * it shouldn't be possible to set those fields. Due to that, an attacker
738 * setting the current thread's pmap to the kernel pmap as a way to bypass
739 * this check won't accomplish anything as it doesn't provide any extra code
740 * signing entitlements.
741 */
742 if ((current != kernel_pmap) &&
743 ((get_mmu_ttb() & TTBR_BADDR_MASK) != (current->ttep))) {
744 panic_plain("%s: Current thread's pmap doesn't match up with TTBR0 "
745 "%#llx %#llx", __func__, get_mmu_ttb(), current->ttep);
746 }
747 #endif /* XNU_MONITOR */
748
749 return current;
750 }
751
#if DEVELOPMENT || DEBUG

/*
 * Trace levels are controlled by a bitmask in which each
 * level can be enabled/disabled by the (1<<level) position
 * in the boot arg
 * Level 0: PPL extension functionality
 * Level 1: pmap lifecycle (create/destroy/switch)
 * Level 2: mapping lifecycle (enter/remove/protect/nest/unnest)
 * Level 3: internal state management (attributes/fast-fault)
 * Level 4-7: TTE traces for paging levels 0-3.  TTBs are traced at level 4.
 */

SECURITY_READ_ONLY_LATE(unsigned int) pmap_trace_mask = 0;

/*
 * Emit a kdebug trace event if the given trace level is enabled.
 * Wrapped in do/while(0) so the embedded 'if' cannot capture a caller's
 * 'else' when the macro is used in an unbraced if/else (CERT C PRE10-C).
 */
#define PMAP_TRACE(level, ...) \
	do { \
	        if (__improbable((1 << (level)) & pmap_trace_mask)) { \
	                KDBG_RELEASE(__VA_ARGS__); \
	        } \
	} while (0)
#else /* DEVELOPMENT || DEBUG */

#define PMAP_TRACE(level, ...)

#endif /* DEVELOPMENT || DEBUG */
776
777
778 /*
779 * Internal function prototypes (forward declarations).
780 */
781
782 static vm_map_size_t pmap_user_va_size(pmap_t pmap);
783
784 static void pmap_set_reference(ppnum_t pn);
785
786 pmap_paddr_t pmap_vtophys(pmap_t pmap, addr64_t va);
787
788 static void pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr);
789
790 static kern_return_t pmap_expand(
791 pmap_t, vm_map_address_t, unsigned int options, unsigned int level);
792
793 static int pmap_remove_range(
794 pmap_t, vm_map_address_t, pt_entry_t *, pt_entry_t *);
795
796 static tt_entry_t *pmap_tt1_allocate(
797 pmap_t, vm_size_t, unsigned int);
798
799 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
800
801 static void pmap_tt1_deallocate(
802 pmap_t, tt_entry_t *, vm_size_t, unsigned int);
803
804 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
805
806 static kern_return_t pmap_tt_allocate(
807 pmap_t, tt_entry_t **, unsigned int, unsigned int);
808
809 #define PMAP_TT_ALLOCATE_NOWAIT 0x1
810
811 const unsigned int arm_hardware_page_size = ARM_PGBYTES;
812 const unsigned int arm_pt_desc_size = sizeof(pt_desc_t);
813 const unsigned int arm_pt_root_size = PMAP_ROOT_ALLOC_SIZE;
814
815 #define PMAP_TT_DEALLOCATE_NOBLOCK 0x1
816
817
818 static void pmap_unmap_commpage(
819 pmap_t pmap);
820
821 static boolean_t
822 pmap_is_64bit(pmap_t);
823
824
825 static void pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t);
826
827 static void pmap_update_pp_attr_wimg_bits_locked(unsigned int, unsigned int);
828
829 static bool pmap_update_cache_attributes_locked(
830 ppnum_t, unsigned, bool);
831
832 static boolean_t arm_clear_fast_fault(
833 ppnum_t ppnum,
834 vm_prot_t fault_type,
835 pt_entry_t *pte_p);
836
837 static void pmap_trim_self(pmap_t pmap);
838 static void pmap_trim_subord(pmap_t subord);
839
840
841 /*
842 * Temporary prototypes, while we wait for pmap_enter to move to taking an
843 * address instead of a page number.
844 */
845 static kern_return_t
846 pmap_enter_addr(
847 pmap_t pmap,
848 vm_map_address_t v,
849 pmap_paddr_t pa,
850 vm_prot_t prot,
851 vm_prot_t fault_type,
852 unsigned int flags,
853 boolean_t wired);
854
855 kern_return_t
856 pmap_enter_options_addr(
857 pmap_t pmap,
858 vm_map_address_t v,
859 pmap_paddr_t pa,
860 vm_prot_t prot,
861 vm_prot_t fault_type,
862 unsigned int flags,
863 boolean_t wired,
864 unsigned int options,
865 __unused void *arg,
866 __unused pmap_mapping_type_t mapping_type);
867
868 #ifdef CONFIG_XNUPOST
869 kern_return_t pmap_test(void);
870 #endif /* CONFIG_XNUPOST */
871
/* PPL-dispatched entry points for fault handling (see PMAP_SUPPORT_PROTOTYPES). */
PMAP_SUPPORT_PROTOTYPES(
	kern_return_t,
	arm_fast_fault, (pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	bool from_user), ARM_FAST_FAULT_INDEX);

PMAP_SUPPORT_PROTOTYPES(
	boolean_t,
	arm_force_fast_fault, (ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options), ARM_FORCE_FAST_FAULT_INDEX);

/* Variant of arm_force_fast_fault that defers TLB flushes into flush_range. */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range);

/**
 * Definition of the states driving the batch cache attributes update
 * state machine.
 */
typedef struct {
	uint64_t page_index : 32,            /* The page index to be operated on */
	    state : 8,                       /* The current state of the update machine */
	    tlb_flush_pass_needed : 1,       /* Tracking whether the tlb flush pass is necessary */
	    rt_cache_flush_pass_needed : 1,  /* Tracking whether the cache flush pass is necessary */
	    :0;                              /* Pad the remaining bits of the uint64_t */
} batch_set_cache_attr_state_t;

/* Possible values of the "state" field. */
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS     1
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS   2
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS 3
#define PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE            4

/* The state must round-trip through a single 64-bit PPL call argument/return. */
static_assert(sizeof(batch_set_cache_attr_state_t) == sizeof(uint64_t));
912
913 PMAP_SUPPORT_PROTOTYPES(
914 batch_set_cache_attr_state_t,
915 pmap_batch_set_cache_attributes, (
916 #if XNU_MONITOR
917 volatile upl_page_info_t *user_page_list,
918 #else /* !XNU_MONITOR */
919 upl_page_info_array_t user_page_list,
920 #endif /* XNU_MONITOR */
921 batch_set_cache_attr_state_t state,
922 unsigned int page_cnt,
923 unsigned int cacheattr), PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX);
924
925 PMAP_SUPPORT_PROTOTYPES(
926 kern_return_t,
927 pmap_change_wiring, (pmap_t pmap,
928 vm_map_address_t v,
929 boolean_t wired), PMAP_CHANGE_WIRING_INDEX);
930
931 PMAP_SUPPORT_PROTOTYPES(
932 pmap_t,
933 pmap_create_options, (ledger_t ledger,
934 vm_map_size_t size,
935 unsigned int flags,
936 kern_return_t * kr), PMAP_CREATE_INDEX);
937
938 PMAP_SUPPORT_PROTOTYPES(
939 void,
940 pmap_destroy, (pmap_t pmap), PMAP_DESTROY_INDEX);
941
942 PMAP_SUPPORT_PROTOTYPES(
943 kern_return_t,
944 pmap_enter_options, (pmap_t pmap,
945 vm_map_address_t v,
946 pmap_paddr_t pa,
947 vm_prot_t prot,
948 vm_prot_t fault_type,
949 unsigned int flags,
950 boolean_t wired,
951 unsigned int options), PMAP_ENTER_OPTIONS_INDEX);
952
953 PMAP_SUPPORT_PROTOTYPES(
954 pmap_paddr_t,
955 pmap_find_pa, (pmap_t pmap,
956 addr64_t va), PMAP_FIND_PA_INDEX);
957
958 PMAP_SUPPORT_PROTOTYPES(
959 kern_return_t,
960 pmap_insert_commpage, (pmap_t pmap), PMAP_INSERT_COMMPAGE_INDEX);
961
962
963 PMAP_SUPPORT_PROTOTYPES(
964 boolean_t,
965 pmap_is_empty, (pmap_t pmap,
966 vm_map_offset_t va_start,
967 vm_map_offset_t va_end), PMAP_IS_EMPTY_INDEX);
968
969
970 PMAP_SUPPORT_PROTOTYPES(
971 unsigned int,
972 pmap_map_cpu_windows_copy, (ppnum_t pn,
973 vm_prot_t prot,
974 unsigned int wimg_bits), PMAP_MAP_CPU_WINDOWS_COPY_INDEX);
975
976 PMAP_SUPPORT_PROTOTYPES(
977 void,
978 pmap_ro_zone_memcpy, (zone_id_t zid,
979 vm_offset_t va,
980 vm_offset_t offset,
981 const vm_offset_t new_data,
982 vm_size_t new_data_size), PMAP_RO_ZONE_MEMCPY_INDEX);
983
984 PMAP_SUPPORT_PROTOTYPES(
985 uint64_t,
986 pmap_ro_zone_atomic_op, (zone_id_t zid,
987 vm_offset_t va,
988 vm_offset_t offset,
989 zro_atomic_op_t op,
990 uint64_t value), PMAP_RO_ZONE_ATOMIC_OP_INDEX);
991
992 PMAP_SUPPORT_PROTOTYPES(
993 void,
994 pmap_ro_zone_bzero, (zone_id_t zid,
995 vm_offset_t va,
996 vm_offset_t offset,
997 vm_size_t size), PMAP_RO_ZONE_BZERO_INDEX);
998
999 PMAP_SUPPORT_PROTOTYPES(
1000 vm_map_offset_t,
1001 pmap_nest, (pmap_t grand,
1002 pmap_t subord,
1003 addr64_t vstart,
1004 uint64_t size,
1005 vm_map_offset_t vrestart,
1006 kern_return_t * krp), PMAP_NEST_INDEX);
1007
1008 PMAP_SUPPORT_PROTOTYPES(
1009 void,
1010 pmap_page_protect_options, (ppnum_t ppnum,
1011 vm_prot_t prot,
1012 unsigned int options,
1013 void *arg), PMAP_PAGE_PROTECT_OPTIONS_INDEX);
1014
1015 PMAP_SUPPORT_PROTOTYPES(
1016 vm_map_address_t,
1017 pmap_protect_options, (pmap_t pmap,
1018 vm_map_address_t start,
1019 vm_map_address_t end,
1020 vm_prot_t prot,
1021 unsigned int options,
1022 void *args), PMAP_PROTECT_OPTIONS_INDEX);
1023
1024 PMAP_SUPPORT_PROTOTYPES(
1025 kern_return_t,
1026 pmap_query_page_info, (pmap_t pmap,
1027 vm_map_offset_t va,
1028 int *disp_p), PMAP_QUERY_PAGE_INFO_INDEX);
1029
1030 PMAP_SUPPORT_PROTOTYPES(
1031 mach_vm_size_t,
1032 pmap_query_resident, (pmap_t pmap,
1033 vm_map_address_t start,
1034 vm_map_address_t end,
1035 mach_vm_size_t * compressed_bytes_p), PMAP_QUERY_RESIDENT_INDEX);
1036
1037 PMAP_SUPPORT_PROTOTYPES(
1038 void,
1039 pmap_reference, (pmap_t pmap), PMAP_REFERENCE_INDEX);
1040
1041 PMAP_SUPPORT_PROTOTYPES(
1042 vm_map_address_t,
1043 pmap_remove_options, (pmap_t pmap,
1044 vm_map_address_t start,
1045 vm_map_address_t end,
1046 int options), PMAP_REMOVE_OPTIONS_INDEX);
1047
1048
1049 PMAP_SUPPORT_PROTOTYPES(
1050 void,
1051 pmap_set_cache_attributes, (ppnum_t pn,
1052 unsigned int cacheattr), PMAP_SET_CACHE_ATTRIBUTES_INDEX);
1053
1054 PMAP_SUPPORT_PROTOTYPES(
1055 void,
1056 pmap_update_compressor_page, (ppnum_t pn,
1057 unsigned int prev_cacheattr, unsigned int new_cacheattr), PMAP_UPDATE_COMPRESSOR_PAGE_INDEX);
1058
1059 PMAP_SUPPORT_PROTOTYPES(
1060 void,
1061 pmap_set_nested, (pmap_t pmap), PMAP_SET_NESTED_INDEX);
1062
1063 #if MACH_ASSERT || XNU_MONITOR
1064 PMAP_SUPPORT_PROTOTYPES(
1065 void,
1066 pmap_set_process, (pmap_t pmap,
1067 int pid,
1068 char *procname), PMAP_SET_PROCESS_INDEX);
1069 #endif
1070
1071 PMAP_SUPPORT_PROTOTYPES(
1072 void,
1073 pmap_unmap_cpu_windows_copy, (unsigned int index), PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX);
1074
1075 PMAP_SUPPORT_PROTOTYPES(
1076 vm_map_offset_t,
1077 pmap_unnest_options, (pmap_t grand,
1078 addr64_t vaddr,
1079 uint64_t size,
1080 vm_map_offset_t vrestart,
1081 unsigned int option), PMAP_UNNEST_OPTIONS_INDEX);
1082
1083 PMAP_SUPPORT_PROTOTYPES(
1084 void,
1085 phys_attribute_set, (ppnum_t pn,
1086 unsigned int bits), PHYS_ATTRIBUTE_SET_INDEX);
1087
1088 PMAP_SUPPORT_PROTOTYPES(
1089 void,
1090 phys_attribute_clear, (ppnum_t pn,
1091 unsigned int bits,
1092 int options,
1093 void *arg), PHYS_ATTRIBUTE_CLEAR_INDEX);
1094
1095 #if __ARM_RANGE_TLBI__
1096 PMAP_SUPPORT_PROTOTYPES(
1097 vm_map_address_t,
1098 phys_attribute_clear_range, (pmap_t pmap,
1099 vm_map_address_t start,
1100 vm_map_address_t end,
1101 unsigned int bits,
1102 unsigned int options), PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX);
1103 #endif /* __ARM_RANGE_TLBI__ */
1104
1105
1106 PMAP_SUPPORT_PROTOTYPES(
1107 void,
1108 pmap_switch, (pmap_t pmap), PMAP_SWITCH_INDEX);
1109
1110 PMAP_SUPPORT_PROTOTYPES(
1111 void,
1112 pmap_clear_user_ttb, (void), PMAP_CLEAR_USER_TTB_INDEX);
1113
1114 PMAP_SUPPORT_PROTOTYPES(
1115 void,
1116 pmap_set_vm_map_cs_enforced, (pmap_t pmap, bool new_value), PMAP_SET_VM_MAP_CS_ENFORCED_INDEX);
1117
1118 PMAP_SUPPORT_PROTOTYPES(
1119 void,
1120 pmap_set_tpro, (pmap_t pmap), PMAP_SET_TPRO_INDEX);
1121
1122 PMAP_SUPPORT_PROTOTYPES(
1123 void,
1124 pmap_set_jit_entitled, (pmap_t pmap), PMAP_SET_JIT_ENTITLED_INDEX);
1125
1126 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
1127 PMAP_SUPPORT_PROTOTYPES(
1128 void,
1129 pmap_disable_user_jop, (pmap_t pmap), PMAP_DISABLE_USER_JOP_INDEX);
1130 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
1131
1132 /* Definition of the states used by pmap_trim(). */
typedef enum {
	/* Validates the inputs and computes the bounds of the pmaps. This state can also jump directly to DONE state in some cases. */
	PMAP_TRIM_STATE_START = 0,

	/* Trims the range from the start of the shared region to the "true" start of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_BEFORE,

	/* Trims the range from the "true" end of the shared region to the end of that of the grand pmap. */
	PMAP_TRIM_STATE_GRAND_AFTER,

	/* Decreases the subord's "no-bound" reference by one. If that becomes zero, trims the subord. */
	PMAP_TRIM_STATE_SUBORD,

	/* Marks that trimming is finished. */
	PMAP_TRIM_STATE_DONE,

	/* Sentry enum for sanity checks. */
	PMAP_TRIM_STATE_COUNT,
} pmap_trim_state_t;
1152
1153 PMAP_SUPPORT_PROTOTYPES(
1154 pmap_trim_state_t,
1155 pmap_trim, (pmap_t grand, pmap_t subord, addr64_t vstart, uint64_t size, pmap_trim_state_t state), PMAP_TRIM_INDEX);
1156
1157 #if HAS_APPLE_PAC
1158 PMAP_SUPPORT_PROTOTYPES(
1159 void *,
1160 pmap_sign_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_SIGN_USER_PTR);
1161 PMAP_SUPPORT_PROTOTYPES(
1162 void *,
1163 pmap_auth_user_ptr, (void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key), PMAP_AUTH_USER_PTR);
1164 #endif /* HAS_APPLE_PAC */
1165
1166
1167
1168
1169 PMAP_SUPPORT_PROTOTYPES(
1170 kern_return_t,
1171 pmap_check_trust_cache_runtime_for_uuid, (const uint8_t check_uuid[kUUIDSize]),
1172 PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX);
1173
1174 PMAP_SUPPORT_PROTOTYPES(
1175 kern_return_t,
1176 pmap_load_trust_cache_with_type, (TCType_t type,
1177 const vm_address_t pmap_img4_payload,
1178 const vm_size_t pmap_img4_payload_len,
1179 const vm_address_t img4_manifest,
1180 const vm_size_t img4_manifest_len,
1181 const vm_address_t img4_aux_manifest,
1182 const vm_size_t img4_aux_manifest_len), PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX);
1183
1184 PMAP_SUPPORT_PROTOTYPES(
1185 void,
1186 pmap_toggle_developer_mode, (bool state), PMAP_TOGGLE_DEVELOPER_MODE_INDEX);
1187
1188 PMAP_SUPPORT_PROTOTYPES(
1189 kern_return_t,
1190 pmap_query_trust_cache, (TCQueryType_t query_type,
1191 const uint8_t cdhash[kTCEntryHashSize],
1192 TrustCacheQueryToken_t * query_token), PMAP_QUERY_TRUST_CACHE_INDEX);
1193
1194 PMAP_SUPPORT_PROTOTYPES(
1195 errno_t,
1196 pmap_image4_monitor_trap, (image4_cs_trap_t selector,
1197 const void *input_data,
1198 size_t input_size), PMAP_IMAGE4_MONITOR_TRAP_INDEX);
1199
1200 #if PMAP_CS_INCLUDE_CODE_SIGNING
1201
1202 PMAP_SUPPORT_PROTOTYPES(
1203 kern_return_t,
1204 pmap_register_provisioning_profile, (const vm_address_t payload_addr,
1205 const vm_size_t payload_size), PMAP_REGISTER_PROVISIONING_PROFILE_INDEX);
1206
1207 PMAP_SUPPORT_PROTOTYPES(
1208 kern_return_t,
1209 pmap_unregister_provisioning_profile, (pmap_cs_profile_t * profile_obj),
1210 PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX);
1211
1212 PMAP_SUPPORT_PROTOTYPES(
1213 kern_return_t,
1214 pmap_associate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry,
1215 pmap_cs_profile_t * profile_obj),
1216 PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX);
1217
1218 PMAP_SUPPORT_PROTOTYPES(
1219 kern_return_t,
1220 pmap_disassociate_provisioning_profile, (pmap_cs_code_directory_t * cd_entry),
1221 PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX);
1222
1223 PMAP_SUPPORT_PROTOTYPES(
1224 kern_return_t,
1225 pmap_associate_kernel_entitlements, (pmap_cs_code_directory_t * cd_entry,
1226 const void *kernel_entitlements),
1227 PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX);
1228
1229 PMAP_SUPPORT_PROTOTYPES(
1230 kern_return_t,
1231 pmap_resolve_kernel_entitlements, (pmap_t pmap,
1232 const void **kernel_entitlements),
1233 PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX);
1234
1235 PMAP_SUPPORT_PROTOTYPES(
1236 kern_return_t,
1237 pmap_accelerate_entitlements, (pmap_cs_code_directory_t * cd_entry),
1238 PMAP_ACCELERATE_ENTITLEMENTS_INDEX);
1239
1240 PMAP_SUPPORT_PROTOTYPES(
1241 kern_return_t,
1242 pmap_cs_allow_invalid, (pmap_t pmap),
1243 PMAP_CS_ALLOW_INVALID_INDEX);
1244
1245 PMAP_SUPPORT_PROTOTYPES(
1246 void,
1247 pmap_set_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1248 PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX);
1249
1250 PMAP_SUPPORT_PROTOTYPES(
1251 bool,
1252 pmap_match_compilation_service_cdhash, (const uint8_t cdhash[CS_CDHASH_LEN]),
1253 PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX);
1254
1255 PMAP_SUPPORT_PROTOTYPES(
1256 void,
1257 pmap_set_local_signing_public_key, (const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE]),
1258 PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX);
1259
1260 PMAP_SUPPORT_PROTOTYPES(
1261 void,
1262 pmap_unrestrict_local_signing, (const uint8_t cdhash[CS_CDHASH_LEN]),
1263 PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX);
1264
1265 #endif
1266
1267 PMAP_SUPPORT_PROTOTYPES(
1268 uint32_t,
1269 pmap_lookup_in_static_trust_cache, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX);
1270
1271 PMAP_SUPPORT_PROTOTYPES(
1272 bool,
1273 pmap_lookup_in_loaded_trust_caches, (const uint8_t cdhash[CS_CDHASH_LEN]), PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX);
1274
1275 PMAP_SUPPORT_PROTOTYPES(
1276 void,
1277 pmap_nop, (pmap_t pmap), PMAP_NOP_INDEX);
1278
1279 void pmap_footprint_suspend(vm_map_t map,
1280 boolean_t suspend);
1281 PMAP_SUPPORT_PROTOTYPES(
1282 void,
1283 pmap_footprint_suspend, (vm_map_t map,
1284 boolean_t suspend),
1285 PMAP_FOOTPRINT_SUSPEND_INDEX);
1286
1287
1288
1289
1290 #if DEVELOPMENT || DEBUG
1291 PMAP_SUPPORT_PROTOTYPES(
1292 kern_return_t,
1293 pmap_test_text_corruption, (pmap_paddr_t),
1294 PMAP_TEST_TEXT_CORRUPTION_INDEX);
1295 #endif /* DEVELOPMENT || DEBUG */
1296
1297 /*
1298 * The low global vector page is mapped at a fixed alias.
1299 * Since the page size is 16k for H8 and newer we map the globals to a 16k
1300 * aligned address. Readers of the globals (e.g. lldb, panic server) need
1301 * to check both addresses anyway for backward compatibility. So for now
1302 * we leave H6 and H7 where they were.
1303 */
1304 #if (ARM_PGSHIFT == 14)
1305 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x4000)
1306 #else
1307 #define LOWGLOBAL_ALIAS (LOW_GLOBAL_BASE_ADDRESS + 0x2000)
1308 #endif
1309
1310
/*
 * Lifetime counters for translation-table and page-table page allocations.
 * 8-byte aligned so they can be updated atomically on all supported targets.
 */
long long alloc_tteroot_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ttepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
long long alloc_ptepages_count __attribute__((aligned(8))) MARK_AS_PMAP_DATA = 0LL;
1314
1315 #if XNU_MONITOR
1316
1317 #if __has_feature(ptrauth_calls)
1318 #define __ptrauth_ppl_handler __ptrauth(ptrauth_key_function_pointer, true, 0)
1319 #else
1320 #define __ptrauth_ppl_handler
1321 #endif
1322
1323 /*
1324 * Table of function pointers used for PPL dispatch.
1325 */
/*
 * Each entry maps a PMAP_*_INDEX dispatch selector to the corresponding
 * *_internal implementation that runs inside the PPL. Entries are signed
 * with __ptrauth_ppl_handler where pointer authentication is available.
 */
const void * __ptrauth_ppl_handler const ppl_handler_table[PMAP_COUNT] = {
	/* Fault handling and physical attribute management. */
	[ARM_FAST_FAULT_INDEX] = arm_fast_fault_internal,
	[ARM_FORCE_FAST_FAULT_INDEX] = arm_force_fast_fault_internal,
	[MAPPING_FREE_PRIME_INDEX] = mapping_free_prime_internal,
	[PHYS_ATTRIBUTE_CLEAR_INDEX] = phys_attribute_clear_internal,
	[PHYS_ATTRIBUTE_SET_INDEX] = phys_attribute_set_internal,
	/* Core pmap lifecycle and mapping operations. */
	[PMAP_BATCH_SET_CACHE_ATTRIBUTES_INDEX] = pmap_batch_set_cache_attributes_internal,
	[PMAP_CHANGE_WIRING_INDEX] = pmap_change_wiring_internal,
	[PMAP_CREATE_INDEX] = pmap_create_options_internal,
	[PMAP_DESTROY_INDEX] = pmap_destroy_internal,
	[PMAP_ENTER_OPTIONS_INDEX] = pmap_enter_options_internal,
	[PMAP_FIND_PA_INDEX] = pmap_find_pa_internal,
	[PMAP_INSERT_COMMPAGE_INDEX] = pmap_insert_commpage_internal,
	[PMAP_IS_EMPTY_INDEX] = pmap_is_empty_internal,
	[PMAP_MAP_CPU_WINDOWS_COPY_INDEX] = pmap_map_cpu_windows_copy_internal,
	/* Read-only zone support. */
	[PMAP_RO_ZONE_MEMCPY_INDEX] = pmap_ro_zone_memcpy_internal,
	[PMAP_RO_ZONE_ATOMIC_OP_INDEX] = pmap_ro_zone_atomic_op_internal,
	[PMAP_RO_ZONE_BZERO_INDEX] = pmap_ro_zone_bzero_internal,
	[PMAP_MARK_PAGE_AS_PMAP_PAGE_INDEX] = pmap_mark_page_as_ppl_page_internal,
	[PMAP_NEST_INDEX] = pmap_nest_internal,
	[PMAP_PAGE_PROTECT_OPTIONS_INDEX] = pmap_page_protect_options_internal,
	[PMAP_PROTECT_OPTIONS_INDEX] = pmap_protect_options_internal,
	[PMAP_QUERY_PAGE_INFO_INDEX] = pmap_query_page_info_internal,
	[PMAP_QUERY_RESIDENT_INDEX] = pmap_query_resident_internal,
	[PMAP_REFERENCE_INDEX] = pmap_reference_internal,
	[PMAP_REMOVE_OPTIONS_INDEX] = pmap_remove_options_internal,
	[PMAP_SET_CACHE_ATTRIBUTES_INDEX] = pmap_set_cache_attributes_internal,
	[PMAP_UPDATE_COMPRESSOR_PAGE_INDEX] = pmap_update_compressor_page_internal,
	[PMAP_SET_NESTED_INDEX] = pmap_set_nested_internal,
	[PMAP_SET_PROCESS_INDEX] = pmap_set_process_internal,
	[PMAP_SWITCH_INDEX] = pmap_switch_internal,
	[PMAP_CLEAR_USER_TTB_INDEX] = pmap_clear_user_ttb_internal,
	[PMAP_UNMAP_CPU_WINDOWS_COPY_INDEX] = pmap_unmap_cpu_windows_copy_internal,
	[PMAP_UNNEST_OPTIONS_INDEX] = pmap_unnest_options_internal,
	[PMAP_FOOTPRINT_SUSPEND_INDEX] = pmap_footprint_suspend_internal,
	[PMAP_CPU_DATA_INIT_INDEX] = pmap_cpu_data_init_internal,
	[PMAP_RELEASE_PAGES_TO_KERNEL_INDEX] = pmap_release_ppl_pages_to_kernel_internal,
	[PMAP_SET_VM_MAP_CS_ENFORCED_INDEX] = pmap_set_vm_map_cs_enforced_internal,
	[PMAP_SET_JIT_ENTITLED_INDEX] = pmap_set_jit_entitled_internal,
	[PMAP_SET_TPRO_INDEX] = pmap_set_tpro_internal,
	/* Trust cache and code-signing support. */
	[PMAP_LOOKUP_IN_STATIC_TRUST_CACHE_INDEX] = pmap_lookup_in_static_trust_cache_internal,
	[PMAP_LOOKUP_IN_LOADED_TRUST_CACHES_INDEX] = pmap_lookup_in_loaded_trust_caches_internal,
	[PMAP_CHECK_TRUST_CACHE_RUNTIME_FOR_UUID_INDEX] = pmap_check_trust_cache_runtime_for_uuid_internal,
	[PMAP_LOAD_TRUST_CACHE_WITH_TYPE_INDEX] = pmap_load_trust_cache_with_type_internal,
	[PMAP_QUERY_TRUST_CACHE_INDEX] = pmap_query_trust_cache_internal,
	[PMAP_TOGGLE_DEVELOPER_MODE_INDEX] = pmap_toggle_developer_mode_internal,
	[PMAP_IMAGE4_MONITOR_TRAP_INDEX] = pmap_image4_monitor_trap_internal,
#if PMAP_CS_INCLUDE_CODE_SIGNING
	[PMAP_SET_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_set_compilation_service_cdhash_internal,
	[PMAP_MATCH_COMPILATION_SERVICE_CDHASH_INDEX] = pmap_match_compilation_service_cdhash_internal,
	[PMAP_SET_LOCAL_SIGNING_PUBLIC_KEY_INDEX] = pmap_set_local_signing_public_key_internal,
	[PMAP_UNRESTRICT_LOCAL_SIGNING_INDEX] = pmap_unrestrict_local_signing_internal,
	[PMAP_REGISTER_PROVISIONING_PROFILE_INDEX] = pmap_register_provisioning_profile_internal,
	[PMAP_UNREGISTER_PROVISIONING_PROFILE_INDEX] = pmap_unregister_provisioning_profile_internal,
	[PMAP_ASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_associate_provisioning_profile_internal,
	[PMAP_DISASSOCIATE_PROVISIONING_PROFILE_INDEX] = pmap_disassociate_provisioning_profile_internal,
	[PMAP_ASSOCIATE_KERNEL_ENTITLEMENTS_INDEX] = pmap_associate_kernel_entitlements_internal,
	[PMAP_RESOLVE_KERNEL_ENTITLEMENTS_INDEX] = pmap_resolve_kernel_entitlements_internal,
	[PMAP_ACCELERATE_ENTITLEMENTS_INDEX] = pmap_accelerate_entitlements_internal,
#endif
	[PMAP_TRIM_INDEX] = pmap_trim_internal,
	/* PPL-owned ledger management. */
	[PMAP_LEDGER_VERIFY_SIZE_INDEX] = pmap_ledger_verify_size_internal,
	[PMAP_LEDGER_ALLOC_INDEX] = pmap_ledger_alloc_internal,
	[PMAP_LEDGER_FREE_INDEX] = pmap_ledger_free_internal,
#if HAS_APPLE_PAC
	[PMAP_SIGN_USER_PTR] = pmap_sign_user_ptr_internal,
	[PMAP_AUTH_USER_PTR] = pmap_auth_user_ptr_internal,
#endif /* HAS_APPLE_PAC */
#if __ARM_RANGE_TLBI__
	[PHYS_ATTRIBUTE_CLEAR_RANGE_INDEX] = phys_attribute_clear_range_internal,
#endif /* __ARM_RANGE_TLBI__ */
#if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
	[PMAP_DISABLE_USER_JOP_INDEX] = pmap_disable_user_jop_internal,
#endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
	[PMAP_NOP_INDEX] = pmap_nop_internal,

#if DEVELOPMENT || DEBUG
	[PMAP_TEST_TEXT_CORRUPTION_INDEX] = pmap_test_text_corruption_internal,
#endif /* DEVELOPMENT || DEBUG */

};
1407 #endif
1408
1409 #if XNU_MONITOR
1410 /**
1411 * A convenience function for setting protections on a single physical
1412 * aperture or static region mapping without invalidating the TLB.
1413 *
1414 * @note This function does not perform any TLB invalidations. That must be done
1415 * separately to be able to safely use the updated mapping.
1416 *
1417 * @note This function understands the difference between the VM page size and
1418 * the kernel page size and will update multiple PTEs if the sizes differ.
1419 * In other words, enough PTEs will always get updated to change the
1420 * permissions on a PAGE_SIZE amount of memory.
1421 *
1422 * @note The PVH lock for the physical page represented by this mapping must
1423 * already be locked.
1424 *
1425 * @note This function assumes the caller has already verified that the PTE
1426 * pointer does indeed point to a physical aperture or static region page
1427 * table. Please validate your inputs before passing it along to this
1428 * function.
1429 *
1430 * @param ptep Pointer to the physical aperture or static region page table to
1431 * update with a new XPRR index.
1432 * @param expected_perm The XPRR index that is expected to already exist at the
1433 * current mapping. If the current index doesn't match this
1434 * then the system will panic.
1435 * @param new_perm The new XPRR index to update the mapping with.
1436 */
MARK_AS_PMAP_TEXT static void
pmap_set_pte_xprr_perm(
	pt_entry_t * const ptep,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	assert(ptep != NULL);

	/* Snapshot the PTE once; all validation and the rewrite use this value. */
	pt_entry_t spte = *ptep;
	pvh_assert_locked(pa_index(pte_to_pa(spte)));

	/* Reject out-of-range XPRR indices before touching anything. */
	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, ptep, new_perm, expected_perm);
	}

	/**
	 * The PTE involved should be valid, should not have the hint bit set, and
	 * should have the expected XPRR index.
	 */
	if (__improbable((spte & ARM_PTE_TYPE_MASK) == ARM_PTE_TYPE_FAULT)) {
		panic_plain("%s: physical aperture or static region PTE is invalid, "
		    "ptep=%p, spte=%#llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	if (__improbable(spte & ARM_PTE_HINT_MASK)) {
		panic_plain("%s: physical aperture or static region PTE has hint bit "
		    "set, ptep=%p, spte=0x%llx, new_perm=%u, expected_perm=%u",
		    __func__, ptep, spte, new_perm, expected_perm);
	}

	if (__improbable(pte_to_xprr_perm(spte) != expected_perm)) {
		panic("%s: perm=%llu does not match expected_perm, spte=0x%llx, "
		    "ptep=%p, new_perm=%u, expected_perm=%u",
		    __func__, pte_to_xprr_perm(spte), spte, ptep, new_perm, expected_perm);
	}

	/* Swap only the XPRR bits; every other attribute is carried over. */
	pt_entry_t template = spte;
	template &= ~ARM_PTE_XPRR_MASK;
	template |= xprr_perm_to_pte(new_perm);

	/* Strong write so the update is observed before the caller's TLB flush. */
	write_pte_strong(ptep, template);
}
1481
1482 /**
1483 * Update the protections on a single physical aperture mapping and invalidate
1484 * the TLB so the mapping can be used.
1485 *
1486 * @note The PVH lock for the physical page must already be locked.
1487 *
1488 * @param pai The physical address index of the page whose physical aperture
1489 * mapping will be updated with new permissions.
1490 * @param expected_perm The XPRR index that is expected to already exist at the
1491 * current mapping. If the current index doesn't match this
1492 * then the system will panic.
1493 * @param new_perm The new XPRR index to update the mapping with.
1494 */
1495 MARK_AS_PMAP_TEXT void
pmap_set_xprr_perm(unsigned int pai,unsigned int expected_perm,unsigned int new_perm)1496 pmap_set_xprr_perm(
1497 unsigned int pai,
1498 unsigned int expected_perm,
1499 unsigned int new_perm)
1500 {
1501 pvh_assert_locked(pai);
1502
1503 const vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
1504 pt_entry_t * const ptep = pmap_pte(kernel_pmap, kva);
1505
1506 pmap_set_pte_xprr_perm(ptep, expected_perm, new_perm);
1507
1508 native_pt_ops.flush_tlb_region_async(kva, PAGE_SIZE, kernel_pmap, true, false);
1509 sync_tlb_flush();
1510 }
1511
1512 /**
1513 * Update the protections on a range of physical aperture or static region
1514 * mappings and invalidate the TLB so the mappings can be used.
1515 *
1516 * @note Static region mappings can only be updated before machine_lockdown().
1517 * Physical aperture mappings can be updated at any time.
1518 *
1519 * @param start The starting virtual address of the static region or physical
1520 * aperture range whose permissions will be updated.
1521 * @param end The final (inclusive) virtual address of the static region or
1522 * physical aperture range whose permissions will be updated.
1523 * @param expected_perm The XPRR index that is expected to already exist at the
1524 * current mappings. If the current indices don't match
1525 * this then the system will panic.
1526 * @param new_perm The new XPRR index to update the mappings with.
1527 */
MARK_AS_PMAP_TEXT static void
pmap_set_range_xprr_perm(
	vm_address_t start,
	vm_address_t end,
	unsigned int expected_perm,
	unsigned int new_perm)
{
	/**
	 * Validate our arguments; any invalid argument will be grounds for a panic.
	 */
	if (__improbable((start | end) & ARM_PGMASK)) {
		panic_plain("%s: start or end not page aligned, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable(start > end)) {
		panic("%s: start > end, start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/* The range must lie entirely within either region; mixing is rejected. */
	const bool in_physmap = (start >= physmap_base) && (end < physmap_end);
	const bool in_static = (start >= gVirtBase) && (end < static_memory_end);

	if (__improbable(!(in_physmap || in_static))) {
		panic_plain("%s: address not in static region or physical aperture, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	if (__improbable((new_perm > XPRR_MAX_PERM) || (expected_perm > XPRR_MAX_PERM))) {
		panic_plain("%s: invalid XPRR index, "
		    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
		    __func__, (void *)start, (void *)end, new_perm, expected_perm);
	}

	/*
	 * Walk over the PTEs for the given range, and set the protections on those
	 * PTEs. Each iteration of this loop will update all of the leaf PTEs within
	 * one twig entry (whichever twig entry currently maps "va").
	 */
	vm_address_t va = start;
	while (va < end) {
		/**
		 * Get the last VA that the twig entry for "va" maps. All of the leaf
		 * PTEs from va to tte_va_end will have their permissions updated.
		 */
		vm_address_t tte_va_end =
		    (va + pt_attr_twig_size(native_pt_attr)) & ~pt_attr_twig_offmask(native_pt_attr);

		/* Clamp the final (possibly partial) twig to the caller's range. */
		if (tte_va_end > end) {
			tte_va_end = end;
		}

		tt_entry_t *ttep = pmap_tte(kernel_pmap, va);

		if (ttep == NULL) {
			panic_plain("%s: physical aperture or static region tte is NULL, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u",
			    __func__, (void *)start, (void *)end, new_perm, expected_perm);
		}

		tt_entry_t tte = *ttep;

		/* Block mappings are not expected here; only table entries are legal. */
		if ((tte & ARM_TTE_TYPE_MASK) != ARM_TTE_TYPE_TABLE) {
			panic_plain("%s: tte=0x%llx is not a table type entry, "
			    "start=%p, end=%p, new_perm=%u, expected_perm=%u", __func__,
			    tte, (void *)start, (void *)end, new_perm, expected_perm);
		}

		/* Walk over the given L3 page table page and update the PTEs. */
		pt_entry_t * const ptep = (pt_entry_t *)ttetokv(tte);
		pt_entry_t * const begin_ptep = &ptep[pte_index(native_pt_attr, va)];
		const uint64_t num_ptes = (tte_va_end - va) >> pt_attr_leaf_shift(native_pt_attr);
		pt_entry_t * const end_ptep = begin_ptep + num_ptes;

		/**
		 * The current PTE pointer is incremented by the page ratio (ratio of
		 * VM page size to kernel hardware page size) because one call to
		 * pmap_set_pte_xprr_perm() will update all PTE entries required to map
		 * a PAGE_SIZE worth of hardware pages.
		 */
		for (pt_entry_t *cur_ptep = begin_ptep; cur_ptep < end_ptep;
		    cur_ptep += PAGE_RATIO, va += PAGE_SIZE) {
			/* The PVH lock is taken per page, as pmap_set_pte_xprr_perm() requires. */
			unsigned int pai = pa_index(pte_to_pa(*cur_ptep));
			pvh_lock(pai);
			pmap_set_pte_xprr_perm(cur_ptep, expected_perm, new_perm);
			pvh_unlock(pai);
		}

		va = tte_va_end;
	}

	/* A single flush for the whole range, issued after every PTE is rewritten. */
	PMAP_UPDATE_TLBS(kernel_pmap, start, end, false, true);
}
1623
1624 #endif /* XNU_MONITOR */
1625
1626 static inline void
PMAP_ZINFO_PALLOC(pmap_t pmap,int bytes)1627 PMAP_ZINFO_PALLOC(
1628 pmap_t pmap, int bytes)
1629 {
1630 pmap_ledger_credit(pmap, task_ledgers.tkm_private, bytes);
1631 }
1632
1633 static inline void
PMAP_ZINFO_PFREE(pmap_t pmap,int bytes)1634 PMAP_ZINFO_PFREE(
1635 pmap_t pmap,
1636 int bytes)
1637 {
1638 pmap_ledger_debit(pmap, task_ledgers.tkm_private, bytes);
1639 }
1640
1641 void
pmap_tt_ledger_credit(pmap_t pmap,vm_size_t size)1642 pmap_tt_ledger_credit(
1643 pmap_t pmap,
1644 vm_size_t size)
1645 {
1646 if (pmap != kernel_pmap) {
1647 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, size);
1648 pmap_ledger_credit(pmap, task_ledgers.page_table, size);
1649 }
1650 }
1651
1652 void
pmap_tt_ledger_debit(pmap_t pmap,vm_size_t size)1653 pmap_tt_ledger_debit(
1654 pmap_t pmap,
1655 vm_size_t size)
1656 {
1657 if (pmap != kernel_pmap) {
1658 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, size);
1659 pmap_ledger_debit(pmap, task_ledgers.page_table, size);
1660 }
1661 }
1662
/*
 * Mark a hardware ASID as recently used in the pseudo-LRU tracking state.
 * No-op on targets with 16-bit ASIDs, which do not use the pLRU allocator.
 */
static inline void
pmap_update_plru(uint16_t asid_index __unused)
{
#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/* Each 64-bit word of the pLRU bitmap covers 64 ASIDs. */
		unsigned plru_index = asid_index >> 6;
		/*
		 * Clear this ASID's "available" bit. If the whole word drains to
		 * zero, start a new generation: bump the word's generation count and
		 * mark every ASID in it available again. The top bit of the final
		 * word stays cleared, as it would index past MAX_HW_ASIDS.
		 */
		if (__improbable(os_atomic_andnot(&asid_plru_bitmap[plru_index], (1ULL << (asid_index & 63)), relaxed) == 0)) {
			asid_plru_generation[plru_index] = ++asid_plru_gencount;
			asid_plru_bitmap[plru_index] = ((plru_index == (MAX_HW_ASIDS >> 6)) ? ~(1ULL << 63) : UINT64_MAX);
		}
	}
#endif /* !HAS_16BIT_ASID */
}
1676
/*
 * Allocate a virtual ASID for "pmap" and derive its hardware ASID and
 * software epoch from it. Returns false if every virtual ASID is in use.
 * On success, fills in pmap->hw_asid and pmap->sw_asid.
 */
static bool
alloc_asid(pmap_t pmap)
{
	int vasid = -1;
	uint16_t hw_asid;

	pmap_simple_lock(&asid_lock);

#if !HAS_16BIT_ASID
	if (__probable(pmap_asid_plru)) {
		/*
		 * pLRU pass: find the bitmap word with the lowest (oldest)
		 * generation count; its availability mask marks ASIDs that have
		 * been used least recently.
		 */
		unsigned plru_index = 0;
		uint64_t lowest_gen = asid_plru_generation[0];
		uint64_t lowest_gen_bitmap = asid_plru_bitmap[0];
		for (unsigned i = 1; i < (sizeof(asid_plru_generation) / sizeof(asid_plru_generation[0])); ++i) {
			if (asid_plru_generation[i] < lowest_gen) {
				plru_index = i;
				lowest_gen = asid_plru_generation[i];
				lowest_gen_bitmap = asid_plru_bitmap[i];
			}
		}

		/*
		 * Scan the free virtual-ASID bitmap in strides of one hardware-ASID
		 * chunk, intersecting each candidate word with the pLRU mask so we
		 * prefer an ASID that is both free and least recently used.
		 */
		for (; plru_index < BITMAP_LEN(pmap_max_asids); plru_index += ((MAX_HW_ASIDS + 1) >> 6)) {
			uint64_t temp_plru = lowest_gen_bitmap & asid_bitmap[plru_index];
			if (temp_plru) {
				vasid = (plru_index << 6) + lsb_first(temp_plru);
#if DEVELOPMENT || DEBUG
				++pmap_asid_hits;
#endif
				break;
			}
		}
	}
#else
	/**
	 * For 16-bit ASID targets, we assume a 1:1 correspondence between ASIDs and active tasks and
	 * therefore allocate directly from the ASID bitmap instead of using the pLRU allocator.
	 * However, we first try to allocate starting from the position of the most-recently allocated
	 * ASID. This is done both as an allocator performance optimization (as it avoids crowding the
	 * lower bit positions and then re-checking those same lower positions every time we allocate
	 * an ASID) as well as a security mitigation to increase the temporal distance between ASID
	 * reuse. This increases the difficulty of leveraging ASID reuse to train branch predictor
	 * logic, without requiring prohibitively expensive RCTX instructions.
	 */
	vasid = bitmap_lsb_next(&asid_bitmap[0], pmap_max_asids, last_allocated_asid);
#endif /* !HAS_16BIT_ASID */
	if (__improbable(vasid < 0)) {
		// bitmap_first() returns highest-order bits first, but a 0-based scheme works
		// slightly better with the collision detection scheme used by pmap_switch_internal().
		vasid = bitmap_lsb_first(&asid_bitmap[0], pmap_max_asids);
#if DEVELOPMENT || DEBUG
		++pmap_asid_misses;
#endif
	}
	if (__improbable(vasid < 0)) {
		/* No virtual ASID available at all; the caller must handle failure. */
		pmap_simple_unlock(&asid_lock);
		return false;
	}
	assert((uint32_t)vasid < pmap_max_asids);
	assert(bitmap_test(&asid_bitmap[0], (unsigned int)vasid));
	/* Claim the virtual ASID while still holding the lock. */
	bitmap_clear(&asid_bitmap[0], (unsigned int)vasid);
#if HAS_16BIT_ASID
	last_allocated_asid = (uint16_t)vasid;
#endif /* HAS_16BIT_ASID */
	pmap_simple_unlock(&asid_lock);
	/* Split the virtual ASID into a hardware ASID and a software epoch. */
	hw_asid = (uint16_t)(vasid % asid_chunk_size);
	pmap->sw_asid = (uint8_t)(vasid / asid_chunk_size);
	if (__improbable(hw_asid == MAX_HW_ASIDS)) {
		/* If we took a PLRU "miss" and ended up with a hardware ASID we can't actually support,
		 * reassign to a reserved VASID. */
		assert(pmap->sw_asid < UINT8_MAX);
		pmap->sw_asid = UINT8_MAX;
		/* Allocate from the high end of the hardware ASID range to reduce the likelihood of
		 * aliasing with vital system processes, which are likely to have lower ASIDs. */
		hw_asid = MAX_HW_ASIDS - 1 - (uint16_t)(vasid / asid_chunk_size);
		assert(hw_asid < MAX_HW_ASIDS);
	}
	pmap_update_plru(hw_asid);
	hw_asid += 1; // Account for ASID 0, which is reserved for the kernel
#if __ARM_KERNEL_PROTECT__
	hw_asid <<= 1; // We're really handing out 2 hardware ASIDs, one for EL0 and one for EL1 access
#endif
	pmap->hw_asid = hw_asid;
	return true;
}
1761
/**
 * Release a pmap's virtual and hardware ASIDs back to the allocator.
 *
 * Safe to call concurrently/repeatedly for the same pmap: ownership of the
 * hardware ASID is taken with an atomic exchange, and only the winner
 * proceeds to return the ASID to the bitmaps.
 *
 * @param pmap The pmap whose ASID should be freed.
 */
static void
free_asid(pmap_t pmap)
{
	unsigned int vasid;
	/* Atomically claim the hw ASID; a zero result means it was already freed (or never set). */
	uint16_t hw_asid = os_atomic_xchg(&pmap->hw_asid, 0, relaxed);
	if (__improbable(hw_asid == 0)) {
		return;
	}

#if __ARM_KERNEL_PROTECT__
	hw_asid >>= 1; /* Undo the EL0/EL1 ASID-pair doubling applied at allocation time. */
#endif
	hw_asid -= 1; /* Undo the +1 that accounts for ASID 0 being reserved for the kernel. */

#if HAS_16BIT_ASID
	/* 1:1 mapping between hardware and virtual ASIDs on 16-bit ASID targets. */
	vasid = hw_asid;
#else
	if (__improbable(pmap->sw_asid == UINT8_MAX)) {
		/* Reconstruct the reserved VASID chosen by alloc_asid() when the pLRU
		 * allocator handed out the unusable slot MAX_HW_ASIDS (inverse of the
		 * "hw_asid = MAX_HW_ASIDS - 1 - (vasid / asid_chunk_size)" mapping). */
		vasid = ((MAX_HW_ASIDS - 1 - hw_asid) * asid_chunk_size) + MAX_HW_ASIDS;
	} else {
		/* Normal case: vasid = sw_asid * chunk + hw_asid. */
		vasid = ((unsigned int)pmap->sw_asid * asid_chunk_size) + hw_asid;
	}

	if (__probable(pmap_asid_plru)) {
		/* Mark the hardware ASID as a preferred (recently freed) pLRU candidate. */
		os_atomic_or(&asid_plru_bitmap[hw_asid >> 6], (1ULL << (hw_asid & 63)), relaxed);
	}
#endif /* HAS_16BIT_ASID */
	pmap_simple_lock(&asid_lock);
	/* The VASID must currently be marked allocated (bit clear) before we free it. */
	assert(!bitmap_test(&asid_bitmap[0], vasid));
	bitmap_set(&asid_bitmap[0], vasid);
	pmap_simple_unlock(&asid_lock);
}
1794
1795
1796 boolean_t
pmap_valid_address(pmap_paddr_t addr)1797 pmap_valid_address(
1798 pmap_paddr_t addr)
1799 {
1800 return pa_valid(addr);
1801 }
1802
1803
1804
1805
1806
1807
1808 /*
1809 * Map memory at initialization. The physical addresses being
1810 * mapped are not managed and are never unmapped.
1811 *
1812 * For now, VM is already on, we only need to map the
1813 * specified memory.
1814 */
1815 vm_map_address_t
pmap_map(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot,unsigned int flags)1816 pmap_map(
1817 vm_map_address_t virt,
1818 vm_offset_t start,
1819 vm_offset_t end,
1820 vm_prot_t prot,
1821 unsigned int flags)
1822 {
1823 kern_return_t kr;
1824 vm_size_t ps;
1825
1826 ps = PAGE_SIZE;
1827 while (start < end) {
1828 kr = pmap_enter(kernel_pmap, virt, (ppnum_t)atop(start),
1829 prot, VM_PROT_NONE, flags, FALSE, PMAP_MAPPING_TYPE_INFER);
1830
1831 if (kr != KERN_SUCCESS) {
1832 panic("%s: failed pmap_enter, "
1833 "virt=%p, start_addr=%p, end_addr=%p, prot=%#x, flags=%#x",
1834 __FUNCTION__,
1835 (void *) virt, (void *) start, (void *) end, prot, flags);
1836 }
1837
1838 virt += ps;
1839 start += ps;
1840 }
1841 return virt;
1842 }
1843
1844 #if XNU_MONITOR
1845 /**
1846 * Remove kernel writeablity from an IO PTE value if the page is owned by
1847 * guarded mode software.
1848 *
1849 * @param paddr The physical address of the page which has to be non-DRAM.
1850 * @param tmplate The PTE value to be evaluated.
1851 *
1852 * @return A new PTE value with permission bits modified.
1853 */
1854 static inline
1855 pt_entry_t
pmap_construct_io_pte(pmap_paddr_t paddr,pt_entry_t tmplate)1856 pmap_construct_io_pte(pmap_paddr_t paddr, pt_entry_t tmplate)
1857 {
1858 assert(!pa_valid(paddr));
1859
1860 const unsigned int wimg_bits = pmap_cache_attributes((ppnum_t)atop(paddr));
1861
1862 if ((wimg_bits & PP_ATTR_MONITOR) && !pmap_ppl_disable) {
1863 /* PPL to own the page by converting KERN_RW to PPL_RW. */
1864 const uint64_t xprr_perm = pte_to_xprr_perm(tmplate);
1865 switch (xprr_perm) {
1866 case XPRR_KERN_RO_PERM:
1867 break;
1868 case XPRR_KERN_RW_PERM:
1869 tmplate &= ~ARM_PTE_XPRR_MASK;
1870 tmplate |= xprr_perm_to_pte(XPRR_PPL_RW_PERM);
1871 break;
1872 default:
1873 panic("%s: Unsupported xPRR perm %llu for pte 0x%llx", __func__, xprr_perm, (uint64_t)tmplate);
1874 }
1875 }
1876
1877 return tmplate;
1878 }
1879 #endif /* XNU_MONITOR */
1880
/**
 * Back-door routine for mapping a physical range into the kernel pmap at
 * initialization, with memory attributes chosen via PMAP_MAP_BD_* options.
 * Writes PTEs directly (no PV tracking) and flushes the TLB for the range.
 *
 * @param virt Starting kernel virtual address for the mapping.
 * @param start Starting physical address (inclusive).
 * @param end Ending physical address (exclusive).
 * @param prot VM_PROT_WRITE selects RW; anything else maps read-only.
 * @param options PMAP_MAP_BD_* selector for the memory attribute index.
 *
 * @return The virtual address one past the last page mapped.
 */
vm_map_address_t
pmap_map_bd_with_options(
	vm_map_address_t virt,
	vm_offset_t start,
	vm_offset_t end,
	vm_prot_t prot,
	int32_t options)
{
	pt_entry_t mem_attr;

	/* Select the memory-attribute index (plus shareability for write-combined). */
	switch (options & PMAP_MAP_BD_MASK) {
	case PMAP_MAP_BD_WCOMB:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		mem_attr |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case PMAP_MAP_BD_POSTED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		break;
	case PMAP_MAP_BD_POSTED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		break;
	case PMAP_MAP_BD_POSTED_COMBINED_REORDERED:
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		break;
	default:
		/* Default: uncached device memory. */
		mem_attr = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		break;
	}

	/* not cacheable and not buffered */
	pt_entry_t tmplate = pa_to_pte(start)
	    | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
	    | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
	    | mem_attr;

#if __ARM_KERNEL_PROTECT__
	tmplate |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	vm_map_address_t vaddr = virt;
	vm_offset_t paddr = start;
	while (paddr < end) {
		pt_entry_t *ptep = pmap_pte(kernel_pmap, vaddr);
		if (ptep == PT_ENTRY_NULL) {
			/* The leaf table for this VA must already exist at this point. */
			panic("pmap_map_bd");
		}

		/**
		 * For every iteration, the paddr encoded in tmplate is incrementing,
		 * but we always start with the original AP bits defined at the top
		 * of the function in tmplate and only modify the AP bits in the pte
		 * variable.
		 */
		pt_entry_t pte;
#if XNU_MONITOR
		if (!pa_valid(paddr)) {
			/* Non-DRAM (I/O) page: may require PPL ownership/permission fixup. */
			pte = pmap_construct_io_pte(paddr, tmplate);
		} else {
			pte = tmplate;
		}
#else /* !XNU_MONITOR */
		pte = tmplate;
#endif

		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		write_pte_strong(ptep, pte);

		/* Advance the physical address encoded in the template PTE. */
		pte_increment_pa(tmplate);
		vaddr += PAGE_SIZE;
		paddr += PAGE_SIZE;
	}

	if (end >= start) {
		flush_mmu_tlb_region(virt, (unsigned)(end - start));
	}

	return vaddr;
}
1959
1960 /*
1961 * Back-door routine for mapping kernel VM at initialization.
1962 * Useful for mapping memory outside the range
1963 * [vm_first_phys, vm_last_phys] (i.e., devices).
1964 * Otherwise like pmap_map.
1965 */
1966 vm_map_address_t
pmap_map_bd(vm_map_address_t virt,vm_offset_t start,vm_offset_t end,vm_prot_t prot)1967 pmap_map_bd(
1968 vm_map_address_t virt,
1969 vm_offset_t start,
1970 vm_offset_t end,
1971 vm_prot_t prot)
1972 {
1973 return pmap_map_bd_with_options(virt, start, end, prot, 0);
1974 }
1975
1976 /*
1977 * Back-door routine for mapping kernel VM at initialization.
1978 * Useful for mapping memory specific physical addresses in early
1979 * boot (i.e., before kernel_map is initialized).
1980 *
1981 * Maps are in the VM_HIGH_KERNEL_WINDOW area.
1982 */
1983
1984 vm_map_address_t
pmap_map_high_window_bd(vm_offset_t pa_start,vm_size_t len,vm_prot_t prot)1985 pmap_map_high_window_bd(
1986 vm_offset_t pa_start,
1987 vm_size_t len,
1988 vm_prot_t prot)
1989 {
1990 pt_entry_t *ptep, pte;
1991 vm_map_address_t va_start = VREGION1_START;
1992 vm_map_address_t va_max = VREGION1_START + VREGION1_SIZE;
1993 vm_map_address_t va_end;
1994 vm_map_address_t va;
1995 vm_size_t offset;
1996
1997 offset = pa_start & PAGE_MASK;
1998 pa_start -= offset;
1999 len += offset;
2000
2001 if (len > (va_max - va_start)) {
2002 panic("%s: area too large, "
2003 "pa_start=%p, len=%p, prot=0x%x",
2004 __FUNCTION__,
2005 (void*)pa_start, (void*)len, prot);
2006 }
2007
2008 scan:
2009 for (; va_start < va_max; va_start += PAGE_SIZE) {
2010 ptep = pmap_pte(kernel_pmap, va_start);
2011 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2012 if (*ptep == ARM_PTE_TYPE_FAULT) {
2013 break;
2014 }
2015 }
2016 if (va_start > va_max) {
2017 panic("%s: insufficient pages, "
2018 "pa_start=%p, len=%p, prot=0x%x",
2019 __FUNCTION__,
2020 (void*)pa_start, (void*)len, prot);
2021 }
2022
2023 for (va_end = va_start + PAGE_SIZE; va_end < va_start + len; va_end += PAGE_SIZE) {
2024 ptep = pmap_pte(kernel_pmap, va_end);
2025 assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
2026 if (*ptep != ARM_PTE_TYPE_FAULT) {
2027 va_start = va_end + PAGE_SIZE;
2028 goto scan;
2029 }
2030 }
2031
2032 for (va = va_start; va < va_end; va += PAGE_SIZE, pa_start += PAGE_SIZE) {
2033 ptep = pmap_pte(kernel_pmap, va);
2034 pte = pa_to_pte(pa_start)
2035 | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX
2036 | ARM_PTE_AP((prot & VM_PROT_WRITE) ? AP_RWNA : AP_RONA)
2037 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
2038 pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
2039 #if __ARM_KERNEL_PROTECT__
2040 pte |= ARM_PTE_NG;
2041 #endif /* __ARM_KERNEL_PROTECT__ */
2042 write_pte_strong(ptep, pte);
2043 }
2044 PMAP_UPDATE_TLBS(kernel_pmap, va_start, va_start + len, false, true);
2045 #if KASAN
2046 kasan_notify_address(va_start, len);
2047 #endif
2048 return va_start;
2049 }
2050
2051 static uint32_t
pmap_compute_max_asids(void)2052 pmap_compute_max_asids(void)
2053 {
2054 DTEntry entry;
2055 void const *prop = NULL;
2056 uint32_t max_asids;
2057 int err;
2058 unsigned int prop_size;
2059
2060 err = SecureDTLookupEntry(NULL, "/defaults", &entry);
2061 assert(err == kSuccess);
2062
2063 if (kSuccess != SecureDTGetProperty(entry, "pmap-max-asids", &prop, &prop_size)) {
2064 /* TODO: consider allowing maxproc limits to be scaled earlier so that
2065 * we can choose a more flexible default value here. */
2066 return MAX_ASIDS;
2067 }
2068
2069 if (prop_size != sizeof(max_asids)) {
2070 panic("pmap-max-asids property is not a 32-bit integer");
2071 }
2072
2073 max_asids = *((uint32_t const *)prop);
2074 #if HAS_16BIT_ASID
2075 if (max_asids > MAX_HW_ASIDS) {
2076 panic("pmap-max-asids 0x%x too large", max_asids);
2077 }
2078 #else
2079 /* Round up to the nearest 64 to make things a bit easier for the Pseudo-LRU allocator. */
2080 max_asids = (max_asids + 63) & ~63UL;
2081
2082 if (((max_asids + MAX_HW_ASIDS) / (MAX_HW_ASIDS + 1)) > MIN(MAX_HW_ASIDS, UINT8_MAX)) {
2083 /* currently capped by size of pmap->sw_asid */
2084 panic("pmap-max-asids 0x%x too large", max_asids);
2085 }
2086 #endif /* HAS_16BIT_ASID */
2087 if (max_asids == 0) {
2088 panic("pmap-max-asids cannot be zero");
2089 }
2090 return max_asids;
2091 }
2092
2093 #if __arm64__
/*
 * pmap_get_arm64_prot
 *
 * return effective armv8 VMSA block protections including
 * table AP/PXN/XN overrides of a pmap entry
 *
 * Walks the translation tables from the root to the block/page descriptor
 * for 'addr', OR-ing together the hierarchical (table-level) AP/XN/PXN
 * override bits encountered along the way, then applies them to the leaf
 * descriptor's permission bits. Returns 0 if the address is unmapped.
 */

uint64_t
pmap_get_arm64_prot(
	pmap_t pmap,
	vm_offset_t addr)
{
	tt_entry_t tte = 0;
	unsigned int level = 0;
	uint64_t tte_type = 0;
	uint64_t effective_prot_bits = 0;
	uint64_t aggregate_tte = 0;
	uint64_t table_ap_bits = 0, table_xn = 0, table_pxn = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Walk from root to leaf, accumulating table-descriptor override bits. */
	for (level = pt_attr->pta_root_level; level <= pt_attr->pta_max_level; level++) {
		tte = *pmap_ttne(pmap, level, addr);

		if (!(tte & ARM_TTE_VALID)) {
			/* Address is not mapped: no effective protections. */
			return 0;
		}

		tte_type = tte & ARM_TTE_TYPE_MASK;

		if ((tte_type == ARM_TTE_TYPE_BLOCK) ||
		    (level == pt_attr->pta_max_level)) {
			/* Block or page mapping; both have the same protection bit layout. */
			break;
		} else if (tte_type == ARM_TTE_TYPE_TABLE) {
			/* All of the table bits we care about are overrides, so just OR them together. */
			aggregate_tte |= tte;
		}
	}

	/* 'tte' now holds the leaf (block/page) descriptor reached by the walk. */
	table_ap_bits = ((aggregate_tte >> ARM_TTE_TABLE_APSHIFT) & AP_MASK);
	table_xn = (aggregate_tte & ARM_TTE_TABLE_XN);
	table_pxn = (aggregate_tte & ARM_TTE_TABLE_PXN);

	/* Start with the PTE bits. */
	effective_prot_bits = tte & (ARM_PTE_APMASK | ARM_PTE_NX | ARM_PTE_PNX);

	/* Table AP bits mask out block/page AP bits */
	effective_prot_bits &= ~(ARM_PTE_AP(table_ap_bits));

	/* XN/PXN bits can be OR'd in. */
	effective_prot_bits |= (table_xn ? ARM_PTE_NX : 0);
	effective_prot_bits |= (table_pxn ? ARM_PTE_PNX : 0);

	return effective_prot_bits;
}
2150 #endif /* __arm64__ */
2151
2152 /*
2153 * Bootstrap the system enough to run with virtual memory.
2154 *
2155 * The early VM initialization code has already allocated
2156 * the first CPU's translation table and made entries for
2157 * all the one-to-one mappings to be found there.
2158 *
2159 * We must set up the kernel pmap structures, the
2160 * physical-to-virtual translation lookup tables for the
2161 * physical memory to be managed (between avail_start and
2162 * avail_end).
2163 *
2164 * Map the kernel's code and data, and allocate the system page table.
2165 * Page_size must already be set.
2166 *
2167 * Parameters:
2168 * first_avail first available physical page -
2169 * after kernel page tables
2170 * avail_start PA of first managed physical page
2171 * avail_end PA of last managed physical page
2172 */
2173
2174 void
pmap_bootstrap(vm_offset_t vstart)2175 pmap_bootstrap(
2176 vm_offset_t vstart)
2177 {
2178 vm_map_offset_t maxoffset;
2179
2180 lck_grp_init(&pmap_lck_grp, "pmap", LCK_GRP_ATTR_NULL);
2181
2182 #if XNU_MONITOR
2183
2184 #if DEVELOPMENT || DEBUG
2185 PE_parse_boot_argn("-unsafe_kernel_text", &pmap_ppl_disable, sizeof(pmap_ppl_disable));
2186 #endif
2187
2188 #if CONFIG_CSR_FROM_DT
2189 if (csr_unsafe_kernel_text) {
2190 pmap_ppl_disable = true;
2191 }
2192 #endif /* CONFIG_CSR_FROM_DT */
2193
2194 #endif /* XNU_MONITOR */
2195
2196 #if DEVELOPMENT || DEBUG
2197 if (PE_parse_boot_argn("pmap_trace", &pmap_trace_mask, sizeof(pmap_trace_mask))) {
2198 kprintf("Kernel traces for pmap operations enabled\n");
2199 }
2200 #endif
2201
2202 /*
2203 * Initialize the kernel pmap.
2204 */
2205 #if ARM_PARAMETERIZED_PMAP
2206 kernel_pmap->pmap_pt_attr = native_pt_attr;
2207 #endif /* ARM_PARAMETERIZED_PMAP */
2208 #if HAS_APPLE_PAC
2209 kernel_pmap->disable_jop = 0;
2210 #endif /* HAS_APPLE_PAC */
2211 kernel_pmap->tte = cpu_tte;
2212 kernel_pmap->ttep = cpu_ttep;
2213 kernel_pmap->min = UINT64_MAX - (1ULL << (64 - T1SZ_BOOT)) + 1;
2214 kernel_pmap->max = UINTPTR_MAX;
2215 os_atomic_init(&kernel_pmap->ref_count, 1);
2216 #if XNU_MONITOR
2217 os_atomic_init(&kernel_pmap->nested_count, 0);
2218 #endif
2219 kernel_pmap->nx_enabled = TRUE;
2220 #ifdef __arm64__
2221 kernel_pmap->is_64bit = TRUE;
2222 #else
2223 kernel_pmap->is_64bit = FALSE;
2224 #endif
2225 #if CONFIG_ROSETTA
2226 kernel_pmap->is_rosetta = FALSE;
2227 #endif
2228
2229 #if ARM_PARAMETERIZED_PMAP
2230 kernel_pmap->pmap_pt_attr = native_pt_attr;
2231 #endif /* ARM_PARAMETERIZED_PMAP */
2232
2233 kernel_pmap->nested_region_addr = 0x0ULL;
2234 kernel_pmap->nested_region_size = 0x0ULL;
2235 kernel_pmap->nested_region_unnested_table_bitmap = NULL;
2236 kernel_pmap->nested_region_unnested_table_bitmap_size = 0x0UL;
2237 kernel_pmap->type = PMAP_TYPE_KERNEL;
2238
2239 kernel_pmap->hw_asid = 0;
2240 kernel_pmap->sw_asid = 0;
2241
2242 pmap_lock_init(kernel_pmap);
2243
2244 pmap_max_asids = pmap_compute_max_asids();
2245 #if HAS_16BIT_ASID
2246 asid_chunk_size = MAX_HW_ASIDS;
2247 #else
2248 pmap_asid_plru = (pmap_max_asids > MAX_HW_ASIDS);
2249 PE_parse_boot_argn("pmap_asid_plru", &pmap_asid_plru, sizeof(pmap_asid_plru));
2250 /* Align the range of available hardware ASIDs to a multiple of 64 to enable the
2251 * masking used by the PLRU scheme. This means we must handle the case in which
2252 * the returned hardware ASID is MAX_HW_ASIDS, which we do in alloc_asid() and free_asid(). */
2253 _Static_assert(sizeof(asid_plru_bitmap[0] == sizeof(uint64_t)), "bitmap_t is not a 64-bit integer");
2254 _Static_assert(((MAX_HW_ASIDS + 1) % 64) == 0, "MAX_HW_ASIDS + 1 is not divisible by 64");
2255 asid_chunk_size = (pmap_asid_plru ? (MAX_HW_ASIDS + 1) : MAX_HW_ASIDS);
2256 #endif /* HAS_16BIT_ASIDS */
2257
2258 const vm_size_t asid_table_size = sizeof(*asid_bitmap) * BITMAP_LEN(pmap_max_asids);
2259
2260 /**
2261 * Bootstrap the core pmap data structures (e.g., pv_head_table,
2262 * pp_attr_table, etc). This function will use `avail_start` to allocate
2263 * space for these data structures.
2264 */
2265 pmap_data_bootstrap();
2266
2267 /**
2268 * Bootstrap any necessary UAT data structures and values needed from the device tree.
2269 */
2270 uat_bootstrap();
2271
2272
2273 /**
2274 * Bootstrap any necessary SART data structures and values needed from the device tree.
2275 */
2276 sart_bootstrap();
2277
2278 /**
2279 * Don't make any assumptions about the alignment of avail_start before this
2280 * point (i.e., pmap_data_bootstrap() performs allocations).
2281 */
2282 avail_start = PMAP_ALIGN(avail_start, __alignof(bitmap_t));
2283
2284 const pmap_paddr_t pmap_struct_start = avail_start;
2285
2286 asid_bitmap = (bitmap_t*)phystokv(avail_start);
2287 avail_start = round_page(avail_start + asid_table_size);
2288
2289 memset((char *)phystokv(pmap_struct_start), 0, avail_start - pmap_struct_start);
2290
2291 vm_first_phys = gPhysBase;
2292 vm_last_phys = trunc_page(avail_end);
2293
2294 queue_init(&map_pmap_list);
2295 queue_enter(&map_pmap_list, kernel_pmap, pmap_t, pmaps);
2296 free_page_size_tt_list = TT_FREE_ENTRY_NULL;
2297 free_page_size_tt_count = 0;
2298 free_page_size_tt_max = 0;
2299 free_two_page_size_tt_list = TT_FREE_ENTRY_NULL;
2300 free_two_page_size_tt_count = 0;
2301 free_two_page_size_tt_max = 0;
2302 free_tt_list = TT_FREE_ENTRY_NULL;
2303 free_tt_count = 0;
2304 free_tt_max = 0;
2305
2306 virtual_space_start = vstart;
2307 virtual_space_end = VM_MAX_KERNEL_ADDRESS;
2308
2309 bitmap_full(&asid_bitmap[0], pmap_max_asids);
2310 #if !HAS_16BIT_ASID
2311 bitmap_full(&asid_plru_bitmap[0], MAX_HW_ASIDS);
2312 // Clear the highest-order bit, which corresponds to MAX_HW_ASIDS + 1
2313 asid_plru_bitmap[MAX_HW_ASIDS >> 6] = ~(1ULL << 63);
2314 #endif /* !HAS_16BIT_ASID */
2315
2316
2317
2318 if (PE_parse_boot_argn("arm_maxoffset", &maxoffset, sizeof(maxoffset))) {
2319 maxoffset = trunc_page(maxoffset);
2320 if ((maxoffset >= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MIN))
2321 && (maxoffset <= pmap_max_offset(FALSE, ARM_PMAP_MAX_OFFSET_MAX))) {
2322 arm_pmap_max_offset_default = maxoffset;
2323 }
2324 }
2325 #if defined(__arm64__)
2326 if (PE_parse_boot_argn("arm64_maxoffset", &maxoffset, sizeof(maxoffset))) {
2327 maxoffset = trunc_page(maxoffset);
2328 if ((maxoffset >= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MIN))
2329 && (maxoffset <= pmap_max_offset(TRUE, ARM_PMAP_MAX_OFFSET_MAX))) {
2330 arm64_pmap_max_offset_default = maxoffset;
2331 }
2332 }
2333 #endif
2334
2335 PE_parse_boot_argn("pmap_panic_dev_wimg_on_managed", &pmap_panic_dev_wimg_on_managed, sizeof(pmap_panic_dev_wimg_on_managed));
2336
2337
2338 #if PMAP_CS_PPL_MONITOR
2339 /* Initialize the PPL trust cache read-write lock */
2340 lck_rw_init(&ppl_trust_cache_rt_lock, &pmap_lck_grp, 0);
2341 ppl_trust_cache_rt_lock.lck_rw_can_sleep = FALSE;
2342 #endif
2343
2344 #if MACH_ASSERT
2345 PE_parse_boot_argn("vm_footprint_suspend_allowed",
2346 &vm_footprint_suspend_allowed,
2347 sizeof(vm_footprint_suspend_allowed));
2348 #endif /* MACH_ASSERT */
2349
2350 #if KASAN
2351 /* Shadow the CPU copy windows, as they fall outside of the physical aperture */
2352 kasan_map_shadow(CPUWINDOWS_BASE, CPUWINDOWS_TOP - CPUWINDOWS_BASE, true);
2353 #endif /* KASAN */
2354
2355 /**
2356 * Ensure that avail_start is always left on a page boundary. The calling
2357 * code might not perform any alignment before allocating page tables so
2358 * this is important.
2359 */
2360 avail_start = round_page(avail_start);
2361 }
2362
2363 #if XNU_MONITOR
2364
2365 static inline void
pa_set_range_monitor(pmap_paddr_t start_pa,pmap_paddr_t end_pa)2366 pa_set_range_monitor(pmap_paddr_t start_pa, pmap_paddr_t end_pa)
2367 {
2368 pmap_paddr_t cur_pa;
2369 for (cur_pa = start_pa; cur_pa < end_pa; cur_pa += ARM_PGBYTES) {
2370 assert(pa_valid(cur_pa));
2371 ppattr_pa_set_monitor(cur_pa);
2372 }
2373 }
2374
2375 void
pa_set_range_xprr_perm(pmap_paddr_t start_pa,pmap_paddr_t end_pa,unsigned int expected_perm,unsigned int new_perm)2376 pa_set_range_xprr_perm(pmap_paddr_t start_pa,
2377 pmap_paddr_t end_pa,
2378 unsigned int expected_perm,
2379 unsigned int new_perm)
2380 {
2381 vm_offset_t start_va = phystokv(start_pa);
2382 vm_offset_t end_va = start_va + (end_pa - start_pa);
2383
2384 pa_set_range_monitor(start_pa, end_pa);
2385 pmap_set_range_xprr_perm(start_va, end_va, expected_perm, new_perm);
2386 }
2387
/**
 * Lock down every physical page backing the kernelcache by tagging each
 * page's PV head with PVH_FLAG_LOCKDOWN_KC, preventing remapping. Pages
 * whose physical-aperture mapping is non-linear w.r.t. the kernelcache
 * (memory that will later be reclaimed) are skipped. Panics if any page
 * in the range is already locked down.
 */
static void
pmap_lockdown_kc(void)
{
	extern vm_offset_t vm_kernelcache_base;
	extern vm_offset_t vm_kernelcache_top;
	pmap_paddr_t start_pa = kvtophys_nofail(vm_kernelcache_base);
	pmap_paddr_t end_pa = start_pa + (vm_kernelcache_top - vm_kernelcache_base);
	pmap_paddr_t cur_pa = start_pa;
	vm_offset_t cur_va = vm_kernelcache_base;
	while (cur_pa < end_pa) {
		vm_size_t range_size = end_pa - cur_pa;
		/* phystokv_range() may shrink range_size to a physically contiguous run. */
		vm_offset_t ptov_va = phystokv_range(cur_pa, &range_size);
		if (ptov_va != cur_va) {
			/*
			 * If the physical address maps back to a virtual address that is non-linear
			 * w.r.t. the kernelcache, that means it corresponds to memory that will be
			 * reclaimed by the OS and should therefore not be locked down.
			 */
			cur_pa += range_size;
			cur_va += range_size;
			continue;
		}
		unsigned int pai = pa_index(cur_pa);
		pv_entry_t **pv_h = pai_to_pvh(pai);

		vm_offset_t pvh_flags = pvh_get_flags(pv_h);

		if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
			panic("pai %d already locked down", pai);
		}

		pvh_set_flags(pv_h, pvh_flags | PVH_FLAG_LOCKDOWN_KC);
		cur_pa += ARM_PGBYTES;
		cur_va += ARM_PGBYTES;
	}
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
	/* The CTRR test pages must stay remappable: strip the lockdown flag from them. */
	extern uint64_t ctrr_ro_test;
	extern uint64_t ctrr_nx_test;
	pmap_paddr_t exclude_pages[] = {kvtophys_nofail((vm_offset_t)&ctrr_ro_test), kvtophys_nofail((vm_offset_t)&ctrr_nx_test)};
	for (unsigned i = 0; i < (sizeof(exclude_pages) / sizeof(exclude_pages[0])); ++i) {
		pv_entry_t **pv_h = pai_to_pvh(pa_index(exclude_pages[i]));
		pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_LOCKDOWN_KC);
	}
#endif
}
2433
2434 void
pmap_static_allocations_done(void)2435 pmap_static_allocations_done(void)
2436 {
2437 pmap_paddr_t monitor_start_pa;
2438 pmap_paddr_t monitor_end_pa;
2439
2440 /*
2441 * Protect the bootstrap (V=P and V->P) page tables.
2442 *
2443 * These bootstrap allocations will be used primarily for page tables.
2444 * If we wish to secure the page tables, we need to start by marking
2445 * these bootstrap allocations as pages that we want to protect.
2446 */
2447 monitor_start_pa = kvtophys_nofail((vm_offset_t)&bootstrap_pagetables);
2448 monitor_end_pa = monitor_start_pa + BOOTSTRAP_TABLE_SIZE;
2449
2450 /* The bootstrap page tables are mapped RW at boostrap. */
2451 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_KERN_RO_PERM);
2452
2453 /*
2454 * We use avail_start as a pointer to the first address that has not
2455 * been reserved for bootstrap, so we know which pages to give to the
2456 * virtual memory layer.
2457 */
2458 monitor_start_pa = first_avail_phys;
2459 monitor_end_pa = avail_start;
2460
2461 /* The other bootstrap allocations are mapped RW at bootstrap. */
2462 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2463
2464 /*
2465 * The RO page tables are mapped RW in arm_vm_init() and later restricted
2466 * to RO in arm_vm_prot_finalize(), which is called after this function.
2467 * Here we only need to mark the underlying physical pages as PPL-owned to ensure
2468 * they can't be allocated for other uses. We don't need a special xPRR
2469 * protection index, as there is no PPL_RO index, and these pages are ultimately
2470 * protected by KTRR/CTRR. Furthermore, use of PPL_RW for these pages would
2471 * expose us to a functional issue on H11 devices where CTRR shifts the APRR
2472 * lookup table index to USER_XO before APRR is applied, leading the hardware
2473 * to believe we are dealing with an user XO page upon performing a translation.
2474 */
2475 monitor_start_pa = kvtophys_nofail((vm_offset_t)&ropagetable_begin);
2476 monitor_end_pa = monitor_start_pa + ((vm_offset_t)&ropagetable_end - (vm_offset_t)&ropagetable_begin);
2477 pa_set_range_monitor(monitor_start_pa, monitor_end_pa);
2478
2479 monitor_start_pa = kvtophys_nofail(segPPLDATAB);
2480 monitor_end_pa = monitor_start_pa + segSizePPLDATA;
2481
2482 /* PPL data is RW for the PPL, RO for the kernel. */
2483 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RW_PERM, XPRR_PPL_RW_PERM);
2484
2485 monitor_start_pa = kvtophys_nofail(segPPLTEXTB);
2486 monitor_end_pa = monitor_start_pa + segSizePPLTEXT;
2487
2488 /* PPL text is RX for the PPL, RO for the kernel. */
2489 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RX_PERM, XPRR_PPL_RX_PERM);
2490
2491
2492 /*
2493 * In order to support DTrace, the save areas for the PPL must be
2494 * writable. This is due to the fact that DTrace will try to update
2495 * register state.
2496 */
2497 if (pmap_ppl_disable) {
2498 vm_offset_t monitor_start_va = phystokv(ppl_cpu_save_area_start);
2499 vm_offset_t monitor_end_va = monitor_start_va + (ppl_cpu_save_area_end - ppl_cpu_save_area_start);
2500
2501 pmap_set_range_xprr_perm(monitor_start_va, monitor_end_va, XPRR_PPL_RW_PERM, XPRR_KERN_RW_PERM);
2502 }
2503
2504
2505 if (segSizePPLDATACONST > 0) {
2506 monitor_start_pa = kvtophys_nofail(segPPLDATACONSTB);
2507 monitor_end_pa = monitor_start_pa + segSizePPLDATACONST;
2508
2509 pa_set_range_xprr_perm(monitor_start_pa, monitor_end_pa, XPRR_KERN_RO_PERM, XPRR_KERN_RO_PERM);
2510 }
2511
2512 /*
2513 * Mark the original physical aperture mapping for the PPL stack pages RO as an additional security
2514 * precaution. The real RW mappings are at a different location with guard pages.
2515 */
2516 pa_set_range_xprr_perm(pmap_stacks_start_pa, pmap_stacks_end_pa, XPRR_PPL_RW_PERM, XPRR_KERN_RO_PERM);
2517
2518 /* Prevent remapping of the kernelcache */
2519 pmap_lockdown_kc();
2520 }
2521
/**
 * Transition the PPL into its locked-down state by sealing the commpage
 * mappings against further remapping.
 */
void
pmap_lockdown_ppl(void)
{
	/* Mark the PPL as being locked down. */

	mp_disable_preemption(); // for _nopreempt locking operations
	pmap_ppl_lockdown_page(commpage_ro_data_kva, PVH_FLAG_LOCKDOWN_KC, false);
	if (commpage_text_kva != 0) {
		/* The commpage text page additionally needs execute permission. */
		pmap_ppl_lockdown_page_with_prot(commpage_text_kva, PVH_FLAG_LOCKDOWN_KC,
		    false, VM_PROT_READ | VM_PROT_EXECUTE);
	}
	mp_enable_preemption();

	/* Write-protect the kernel RO commpage. */
	/* NOTE(review): this bare #error appears to be the surviving branch of an
	 * XPRR configuration #if whose guard is not visible in this excerpt — as
	 * written it would unconditionally fail compilation; confirm against the
	 * full source that a configuration conditional encloses it. */
#error "XPRR configuration error"
}
2538 #endif /* XNU_MONITOR */
2539
2540 void
pmap_virtual_space(vm_offset_t * startp,vm_offset_t * endp)2541 pmap_virtual_space(
2542 vm_offset_t *startp,
2543 vm_offset_t *endp
2544 )
2545 {
2546 *startp = virtual_space_start;
2547 *endp = virtual_space_end;
2548 }
2549
2550
/**
 * Enumerate the kernel virtual regions that the VM layer may manage.
 * Called repeatedly with increasing region_select values; returns TRUE and
 * fills *startp/*size while a region exists for that index, FALSE otherwise.
 * The set of regions depends on KTRR/CTRR and ARM_LARGE_MEMORY configuration.
 */
boolean_t
pmap_virtual_region(
	unsigned int region_select,
	vm_map_offset_t *startp,
	vm_map_size_t *size
	)
{
	boolean_t ret = FALSE;
#if defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)
	if (region_select == 0) {
		/*
		 * In this config, the bootstrap mappings should occupy their own L2
		 * TTs, as they should be immutable after boot. Having the associated
		 * TTEs and PTEs in their own pages allows us to lock down those pages,
		 * while allowing the rest of the kernel address range to be remapped.
		 */
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
#if defined(ARM_LARGE_MEMORY)
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
#else
		*size = ((VM_MAX_KERNEL_ADDRESS - *startp) & ~PAGE_MASK);
#endif
		ret = TRUE;
	}

#if defined(ARM_LARGE_MEMORY)
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#endif
#else /* !(defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR)) */
#if defined(ARM_LARGE_MEMORY)
	/* For large memory systems with no KTRR/CTRR such as virtual machines */
	if (region_select == 0) {
		*startp = LOW_GLOBAL_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK;
		*size = ((KERNEL_PMAP_HEAP_RANGE_START - *startp) & ~PAGE_MASK);
		ret = TRUE;
	}

	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
#else /* !defined(ARM_LARGE_MEMORY) */
	unsigned long low_global_vr_mask = 0;
	vm_map_size_t low_global_vr_size = 0;

	if (region_select == 0) {
		/* Round to avoid overlapping with the V=P area; round to at least the L2 block size. */
		/* The two mask constants below are ~(32MB-1) and ~(8MB-1): the L2 block sizes
		 * for 16K and 4K page geometries respectively. */
		if (!TEST_PAGE_SIZE_4K) {
			*startp = gVirtBase & 0xFFFFFFFFFE000000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFE000000)) + ~0xFFFFFFFFFE000000) & 0xFFFFFFFFFE000000;
		} else {
			*startp = gVirtBase & 0xFFFFFFFFFF800000;
			*size = ((virtual_space_start - (gVirtBase & 0xFFFFFFFFFF800000)) + ~0xFFFFFFFFFF800000) & 0xFFFFFFFFFF800000;
		}
		ret = TRUE;
	}
	if (region_select == 1) {
		*startp = VREGION1_START;
		*size = VREGION1_SIZE;
		ret = TRUE;
	}
	/* We need to reserve a range that is at least the size of an L2 block mapping for the low globals */
	if (!TEST_PAGE_SIZE_4K) {
		low_global_vr_mask = 0xFFFFFFFFFE000000;
		low_global_vr_size = 0x2000000;
	} else {
		low_global_vr_mask = 0xFFFFFFFFFF800000;
		low_global_vr_size = 0x800000;
	}

	/* Only expose the low-globals region when it doesn't alias the kernel base block. */
	if (((gVirtBase & low_global_vr_mask) != LOW_GLOBAL_BASE_ADDRESS) && (region_select == 2)) {
		*startp = LOW_GLOBAL_BASE_ADDRESS;
		*size = low_global_vr_size;
		ret = TRUE;
	}

	if (region_select == 3) {
		/* In this config, we allow the bootstrap mappings to occupy the same
		 * page table pages as the heap.
		 */
		*startp = VM_MIN_KERNEL_ADDRESS;
		*size = LOW_GLOBAL_BASE_ADDRESS - *startp;
		ret = TRUE;
	}
#endif /* defined(ARM_LARGE_MEMORY) */
#endif /* defined(KERNEL_INTEGRITY_KTRR) || defined(KERNEL_INTEGRITY_CTRR) */
	return ret;
}
2644
2645 /*
2646 * Routines to track and allocate physical pages during early boot.
2647 * On most systems that memory runs from first_avail through to avail_end
2648 * with no gaps.
2649 *
2650 * If the system supports ECC and ecc_bad_pages_count > 0, we
2651 * need to skip those pages.
2652 */
2653
/* Pages remaining between first_avail and avail_end for early-boot allocation. */
static unsigned int avail_page_count = 0;
/* One-shot flag: initialize_ram_ranges() runs on the first allocator query
 * (boot is single-threaded at that point, so no locking is needed). */
static bool need_ram_ranges_init = true;
2656
2657
2658 /**
2659 * Checks to see if a given page is in
2660 * the array of known bad pages
2661 *
2662 * @param ppn page number to check
2663 */
bool
pmap_is_bad_ram(__unused ppnum_t ppn)
{
	/* This configuration tracks no ECC bad pages, so no page is ever considered bad. */
	return false;
}
2669
2670 /**
2671 * Prepare bad ram pages to be skipped.
2672 */
2673
2674 /*
2675 * Initialize the count of available pages. No lock needed here,
2676 * as this code is called while kernel boot up is single threaded.
2677 */
2678 static void
initialize_ram_ranges(void)2679 initialize_ram_ranges(void)
2680 {
2681 pmap_paddr_t first = first_avail;
2682 pmap_paddr_t end = avail_end;
2683
2684 assert(first <= end);
2685 assert(first == (first & ~PAGE_MASK));
2686 assert(end == (end & ~PAGE_MASK));
2687 avail_page_count = atop(end - first);
2688
2689 need_ram_ranges_init = false;
2690 }
2691
2692 unsigned int
pmap_free_pages(void)2693 pmap_free_pages(
2694 void)
2695 {
2696 if (need_ram_ranges_init) {
2697 initialize_ram_ranges();
2698 }
2699 return avail_page_count;
2700 }
2701
2702 unsigned int
pmap_free_pages_span(void)2703 pmap_free_pages_span(
2704 void)
2705 {
2706 if (need_ram_ranges_init) {
2707 initialize_ram_ranges();
2708 }
2709 return (unsigned int)atop(avail_end - first_avail);
2710 }
2711
2712
boolean_t
pmap_next_page_hi(
	ppnum_t * pnum,
	__unused boolean_t might_free)
{
	/* No distinct high-priority page pool here; delegate to the common allocator. */
	return pmap_next_page(pnum);
}
2720
2721
2722 boolean_t
pmap_next_page(ppnum_t * pnum)2723 pmap_next_page(
2724 ppnum_t *pnum)
2725 {
2726 if (need_ram_ranges_init) {
2727 initialize_ram_ranges();
2728 }
2729
2730
2731 if (first_avail != avail_end) {
2732 *pnum = (ppnum_t)atop(first_avail);
2733 first_avail += PAGE_SIZE;
2734 assert(avail_page_count > 0);
2735 --avail_page_count;
2736 return TRUE;
2737 }
2738 assert(avail_page_count == 0);
2739 return FALSE;
2740 }
2741
2742
2743 /*
2744 * Initialize the pmap module.
2745 * Called by vm_init, to initialize any structures that the pmap
2746 * system needs to map virtual memory.
2747 */
void
pmap_init(
	void)
{
	/*
	 * Protect page zero in the kernel map.
	 * (can be overruled by permanent translation
	 * table entries at page zero - see arm_vm_init).
	 */
	vm_protect(kernel_map, 0, PAGE_SIZE, TRUE, VM_PROT_NONE);

	/* From here on, the pmap layer is fully set up for general use. */
	pmap_initialized = TRUE;

	/*
	 * Create the zone of physical maps
	 * and the physical-to-virtual entries.
	 */
	pmap_zone = zone_create_ext("pmap", sizeof(struct pmap),
	    ZC_ZFREE_CLEARMEM, ZONE_ID_PMAP, NULL);


	/*
	 * Initialize the pmap object (for tracking the vm_page_t
	 * structures for pages we allocate to be page tables in
	 * pmap_expand()).
	 */
	_vm_object_allocate(mem_size, pmap_object);
	pmap_object->copy_strategy = MEMORY_OBJECT_COPY_NONE;

	/*
	 * The values of [hard_]maxproc may have been scaled, make sure
	 * they are still less than the value of pmap_max_asids, since
	 * each process needs its own address-space ID.
	 */
	if ((uint32_t)maxproc > pmap_max_asids) {
		maxproc = pmap_max_asids;
	}
	if ((uint32_t)hard_maxproc > pmap_max_asids) {
		hard_maxproc = pmap_max_asids;
	}
}
2788
2789 /**
2790 * Verify that a given physical page contains no mappings (outside of the
2791 * default physical aperture mapping).
2792 *
2793 * @param ppnum Physical page number to check there are no mappings to.
2794 *
2795 * @return True if there are no mappings, false otherwise or if the page is not
2796 * kernel-managed.
2797 */
2798 bool
pmap_verify_free(ppnum_t ppnum)2799 pmap_verify_free(ppnum_t ppnum)
2800 {
2801 const pmap_paddr_t pa = ptoa(ppnum);
2802
2803 assert(pa != vm_page_fictitious_addr);
2804
2805 /* Only mappings to kernel-managed physical memory are tracked. */
2806 if (!pa_valid(pa)) {
2807 return false;
2808 }
2809
2810 const unsigned int pai = pa_index(pa);
2811 pv_entry_t **pvh = pai_to_pvh(pai);
2812
2813 return pvh_test_type(pvh, PVH_TYPE_NULL);
2814 }
2815
2816 #if MACH_ASSERT
2817 /**
2818 * Verify that a given physical page contains no mappings (outside of the
2819 * default physical aperture mapping) and if it does, then panic.
2820 *
2821 * @note It's recommended to use pmap_verify_free() directly when operating in
2822 * the PPL since the PVH lock isn't getting grabbed here (due to this code
2823 * normally being called from outside of the PPL, and the pv_head_table
2824 * can't be modified outside of the PPL).
2825 *
2826 * @param ppnum Physical page number to check there are no mappings to.
2827 */
void
pmap_assert_free(ppnum_t ppnum)
{
	const pmap_paddr_t pa = ptoa(ppnum);

	/* Only mappings to kernel-managed physical memory are tracked. */
	if (__probable(!pa_valid(pa) || pmap_verify_free(ppnum))) {
		return;
	}

	/* From here on the page is known to have at least one leftover mapping. */
	const unsigned int pai = pa_index(pa);
	pv_entry_t **pvh = pai_to_pvh(pai);

	/**
	 * This function is always called from outside of the PPL. Because of this,
	 * the PVH entry can't be locked. This function is generally only called
	 * before the VM reclaims a physical page and shouldn't be creating new
	 * mappings. Even if a new mapping is created while parsing the hierarchy,
	 * the worst case is that the system will panic in another way, and we were
	 * already about to panic anyway.
	 */

	/**
	 * Since pmap_verify_free() returned false, that means there is at least one
	 * mapping left. Let's get some extra info on the first mapping we find to
	 * dump in the panic string (the common case is that there is one spare
	 * mapping that was never unmapped).
	 */
	pt_entry_t *first_ptep = PT_ENTRY_NULL;

	if (pvh_test_type(pvh, PVH_TYPE_PTEP)) {
		/* The PVH holds a single PTE pointer directly. */
		first_ptep = pvh_ptep(pvh);
	} else if (pvh_test_type(pvh, PVH_TYPE_PVEP)) {
		pv_entry_t *pvep = pvh_pve_list(pvh);

		/* Each PVE can contain multiple PTEs. Let's find the first one. */
		for (int pve_ptep_idx = 0; pve_ptep_idx < PTE_PER_PVE; pve_ptep_idx++) {
			first_ptep = pve_get_ptep(pvep, pve_ptep_idx);
			if (first_ptep != PT_ENTRY_NULL) {
				break;
			}
		}

		/* The PVE should have at least one valid PTE. */
		assert(first_ptep != PT_ENTRY_NULL);
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		panic("%s: Physical page is being used as a page table at PVH %p (pai: %d)",
		    __func__, pvh, pai);
	} else {
		/**
		 * The mapping disappeared between here and the pmap_verify_free() call.
		 * The only way that can happen is if the VM was racing this call with
		 * a call that unmaps PTEs. Operations on this page should not be
		 * occurring at the same time as this check, and unfortunately we can't
		 * lock the PVH entry to prevent it, so just panic instead.
		 */
		panic("%s: Mapping was detected but is now gone. Is the VM racing this "
		    "call with an operation that unmaps PTEs? PVH %p (pai: %d)",
		    __func__, pvh, pai);
	}

	/* Panic with a unique string identifying the first bad mapping and owner. */
	{
		/* First PTE is mapped by the main CPUs. */
		pmap_t pmap = ptep_get_pmap(first_ptep);
		const char *type = (pmap == kernel_pmap) ? "Kernel" : "User";

		panic("%s: Found at least one mapping to %#llx. First PTEP (%p) is a "
		    "%s CPU mapping (pmap: %p)",
		    __func__, (uint64_t)pa, first_ptep, type, pmap);
	}
}
2900 #endif
2901
2902
2903 static vm_size_t
pmap_root_alloc_size(pmap_t pmap)2904 pmap_root_alloc_size(pmap_t pmap)
2905 {
2906 #pragma unused(pmap)
2907 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
2908 unsigned int root_level = pt_attr_root_level(pt_attr);
2909 return ((pt_attr_ln_index_mask(pt_attr, root_level) >> pt_attr_ln_shift(pt_attr, root_level)) + 1) * sizeof(tt_entry_t);
2910 }
2911
2912
2913 /*
2914 * Create and return a physical map.
2915 *
2916 * If the size specified for the map
2917 * is zero, the map is an actual physical
2918 * map, and may be referenced by the
2919 * hardware.
2920 *
2921 * If the size specified is non-zero,
2922 * the map will be used in software only, and
2923 * is bounded by that size.
2924 */
/**
 * Allocate and initialize a new user pmap (internal/PPL-side implementation).
 *
 * @param ledger Ledger this pmap's memory accounting is charged to; may be
 *               retained here (PPL builds) and is stored in the pmap.
 * @param size   Must be 0 on this configuration; a non-zero size is only
 *               meaningful for stage-2 (guest) pmaps.
 * @param flags  PMAP_CREATE_* options (64-bit, JOP, Rosetta, 4K page force).
 * @param kr     Out parameter: detailed failure reason when PMAP_NULL is
 *               returned; untouched on success.
 *
 * @return The new pmap with ref_count of 1, or PMAP_NULL on failure.
 */
MARK_AS_PMAP_TEXT pmap_t
pmap_create_options_internal(
	ledger_t ledger,
	vm_map_size_t size,
	unsigned int flags,
	kern_return_t *kr)
{
	unsigned i;
	unsigned tte_index_max;
	pmap_t p;
	bool is_64bit = flags & PMAP_CREATE_64BIT;
#if defined(HAS_APPLE_PAC)
	bool disable_jop = flags & PMAP_CREATE_DISABLE_JOP;
#endif /* defined(HAS_APPLE_PAC) */
	kern_return_t local_kr = KERN_SUCCESS;

	if (size != 0) {
		{
			// Size parameter should only be set for stage 2.
			return PMAP_NULL;
		}
	}

	/* Reject unknown flag bits rather than silently ignoring them. */
	if (0 != (flags & ~PMAP_CREATE_KNOWN_FLAGS)) {
		return PMAP_NULL;
	}

#if XNU_MONITOR
	if ((local_kr = pmap_alloc_pmap(&p)) != KERN_SUCCESS) {
		goto pmap_create_fail;
	}

	assert(p != PMAP_NULL);

	if (ledger) {
		pmap_ledger_validate(ledger);
		pmap_ledger_retain(ledger);
	}
#else
	/*
	 * Allocate a pmap struct from the pmap_zone. Then allocate
	 * the translation table of the right size for the pmap.
	 */
	if ((p = (pmap_t) zalloc(pmap_zone)) == PMAP_NULL) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto pmap_create_fail;
	}
#endif

	p->ledger = ledger;


	p->pmap_vm_map_cs_enforced = false;
	p->min = 0;


#if CONFIG_ROSETTA
	/* Mark the pmap as being used for Rosetta (x86_64 translation) if requested. */
	if (flags & PMAP_CREATE_ROSETTA) {
		p->is_rosetta = TRUE;
	} else {
		p->is_rosetta = FALSE;
	}
#endif /* CONFIG_ROSETTA */

#if defined(HAS_APPLE_PAC)
	p->disable_jop = disable_jop;
#endif /* defined(HAS_APPLE_PAC) */

	p->nested_region_true_start = 0;
	p->nested_region_true_end = ~0;

	p->nx_enabled = true;
	p->is_64bit = is_64bit;
	p->nested_pmap = PMAP_NULL;
	p->type = PMAP_TYPE_USER;

#if ARM_PARAMETERIZED_PMAP
	/* Default to the native pt_attr */
	p->pmap_pt_attr = native_pt_attr;
#endif /* ARM_PARAMETERIZED_PMAP */
#if __ARM_MIXED_PAGE_SIZE__
	if (flags & PMAP_CREATE_FORCE_4K_PAGES) {
		p->pmap_pt_attr = &pmap_pt_attr_4k;
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */
	p->max = pmap_user_va_size(p);

	/* Reserve an address-space ID (ASID); fails if the ASID pool is exhausted. */
	if (!pmap_get_pt_ops(p)->alloc_id(p)) {
		local_kr = KERN_NO_SPACE;
		goto id_alloc_fail;
	}

	pmap_lock_init(p);

	p->tt_entry_free = (tt_entry_t *)0;
	tte_index_max = ((unsigned)pmap_root_alloc_size(p) / sizeof(tt_entry_t));


#if XNU_MONITOR
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), PMAP_TT_ALLOCATE_NOWAIT);
#else
	p->tte = pmap_tt1_allocate(p, pmap_root_alloc_size(p), 0);
#endif
	if (!(p->tte)) {
		local_kr = KERN_RESOURCE_SHORTAGE;
		goto tt1_alloc_fail;
	}

	p->ttep = ml_static_vtop((vm_offset_t)p->tte);
	PMAP_TRACE(4, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(p), VM_KERNEL_ADDRHIDE(p->min), VM_KERNEL_ADDRHIDE(p->max), p->ttep);

	/* nullify the translation table */
	for (i = 0; i < tte_index_max; i++) {
		p->tte[i] = ARM_TTE_TYPE_FAULT;
	}

	/* Make the zeroed root table visible to the table walker before use. */
	FLUSH_PTE();

	/*
	 * initialize the rest of the structure
	 */
	p->nested_region_addr = 0x0ULL;
	p->nested_region_size = 0x0ULL;
	p->nested_region_unnested_table_bitmap = NULL;
	p->nested_region_unnested_table_bitmap_size = 0x0UL;

	p->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
	p->nested_no_bounds_refcnt = 0;
	p->nested_bounds_set = false;


#if MACH_ASSERT
	p->pmap_pid = 0;
	strlcpy(p->pmap_procname, "<nil>", sizeof(p->pmap_procname));
#endif /* MACH_ASSERT */
#if DEVELOPMENT || DEBUG
	p->footprint_was_suspended = FALSE;
#endif /* DEVELOPMENT || DEBUG */

#if XNU_MONITOR
	os_atomic_init(&p->nested_count, 0);
	assert(os_atomic_load(&p->ref_count, relaxed) == 0);
	/* Ensure prior updates to the new pmap are visible before the non-zero ref_count is visible */
	os_atomic_thread_fence(release);
#endif
	os_atomic_init(&p->ref_count, 1);
	/* Publish the pmap on the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_enter(&map_pmap_list, p, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	/*
	 * Certain ledger balances can be adjusted outside the PVH lock in pmap_enter(),
	 * which can lead to a concurrent disconnect operation making the balance
	 * transiently negative. The ledger should still ultimately balance out,
	 * which we still check upon pmap destruction.
	 */
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.phys_footprint);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.internal_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.iokit_mapped);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.alternate_accounting_compressed);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.external);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.reusable);
	ledger_disable_panic_on_negative(p->ledger, task_ledgers.wired_mem);

	return p;

	/* Unwind in reverse order of acquisition. */
tt1_alloc_fail:
	pmap_get_pt_ops(p)->free_id(p);
id_alloc_fail:
#if XNU_MONITOR
	pmap_free_pmap(p);

	if (ledger) {
		pmap_ledger_release(ledger);
	}
#else
	zfree(pmap_zone, p);
#endif
pmap_create_fail:
#if XNU_MONITOR
	/* kr points at caller (kernel) memory; pin it so the PPL may write through it. */
	pmap_pin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	*kr = local_kr;
#if XNU_MONITOR
	pmap_unpin_kernel_pages((vm_offset_t)kr, sizeof(*kr));
#endif
	return PMAP_NULL;
}
3115
3116 pmap_t
pmap_create_options(ledger_t ledger,vm_map_size_t size,unsigned int flags)3117 pmap_create_options(
3118 ledger_t ledger,
3119 vm_map_size_t size,
3120 unsigned int flags)
3121 {
3122 pmap_t pmap;
3123 kern_return_t kr = KERN_SUCCESS;
3124
3125 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_START, size, flags);
3126
3127 ledger_reference(ledger);
3128
3129 #if XNU_MONITOR
3130 for (;;) {
3131 pmap = pmap_create_options_ppl(ledger, size, flags, &kr);
3132 if (kr != KERN_RESOURCE_SHORTAGE) {
3133 break;
3134 }
3135 assert(pmap == PMAP_NULL);
3136 pmap_alloc_page_for_ppl(0);
3137 kr = KERN_SUCCESS;
3138 }
3139 #else
3140 pmap = pmap_create_options_internal(ledger, size, flags, &kr);
3141 #endif
3142
3143 if (pmap == PMAP_NULL) {
3144 ledger_dereference(ledger);
3145 }
3146
3147 PMAP_TRACE(1, PMAP_CODE(PMAP__CREATE) | DBG_FUNC_END, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
3148
3149 return pmap;
3150 }
3151
3152 #if XNU_MONITOR
3153 /*
3154 * This symbol remains in place when the PPL is enabled so that the dispatch
3155 * table does not change from development to release configurations.
3156 */
3157 #endif
3158 #if MACH_ASSERT || XNU_MONITOR
/**
 * Record the owning process's pid and name in the pmap for debugging
 * (MACH_ASSERT builds only; compiles to a no-op body otherwise).
 *
 * @param pmap     Target pmap; ignored if NULL or already marked with pid -1.
 * @param pid      Process ID to record.
 * @param procname Process name to copy into the pmap (truncated to fit).
 */
MARK_AS_PMAP_TEXT void
pmap_set_process_internal(
	__unused pmap_t pmap,
	__unused int pid,
	__unused char *procname)
{
#if MACH_ASSERT
	if (pmap == NULL || pmap->pmap_pid == -1) {
		return;
	}

	validate_pmap_mutable(pmap);

	pmap->pmap_pid = pid;
	strlcpy(pmap->pmap_procname, procname, sizeof(pmap->pmap_procname));
#endif /* MACH_ASSERT */
}
3176 #endif /* MACH_ASSERT || XNU_MONITOR */
3177
3178 #if MACH_ASSERT
/* Public wrapper: route the pid/procname tagging through the PPL when enabled. */
void
pmap_set_process(
	pmap_t pmap,
	int pid,
	char *procname)
{
#if XNU_MONITOR
	pmap_set_process_ppl(pmap, pid, procname);
#else
	pmap_set_process_internal(pmap, pid, procname);
#endif
}
3191 #endif /* MACH_ASSERT */
3192
3193 /*
3194 * pmap_deallocate_all_leaf_tts:
3195 *
3196 * Recursive function for deallocating all leaf TTEs. Walks the given TT,
3197 * removing and deallocating all TTEs.
3198 */
/**
 * Recursively walk the translation table starting at first_ttep (at the given
 * level), deallocating every valid table entry beneath it.
 *
 * @param pmap       The pmap owning the tables.
 * @param first_ttep First entry of the table to walk.
 * @param level      Translation level of first_ttep's table; must be above
 *                   the leaf level.
 */
MARK_AS_PMAP_TEXT static void
pmap_deallocate_all_leaf_tts(pmap_t pmap, tt_entry_t * first_ttep, unsigned level)
{
	tt_entry_t tte = ARM_TTE_EMPTY;
	tt_entry_t * ttep = NULL;
	tt_entry_t * last_ttep = NULL;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	assert(level < pt_attr_leaf_level(pt_attr));

	/* Index of ~0 yields the last entry of a table at this level. */
	last_ttep = &first_ttep[ttn_index(pt_attr, ~0, level)];

	for (ttep = first_ttep; ttep <= last_ttep; ttep++) {
		tte = *ttep;

		if (!(tte & ARM_TTE_VALID)) {
			continue;
		}

		/* Block mappings are never expected in user pmaps being torn down. */
		if ((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_BLOCK) {
			panic("%s: found block mapping, ttep=%p, tte=%p, "
			    "pmap=%p, first_ttep=%p, level=%u",
			    __FUNCTION__, ttep, (void *)tte,
			    pmap, first_ttep, level);
		}

		/* Must be valid, type table */
		if (level < pt_attr_twig_level(pt_attr)) {
			/* If we haven't reached the twig level, recurse to the next level. */
			pmap_deallocate_all_leaf_tts(pmap, (tt_entry_t *)phystokv((tte) & ARM_TTE_TABLE_MASK), level + 1);
		}

		/*
		 * Remove the TTE.
		 * NOTE(review): the exclusive lock is taken per-iteration but not
		 * released here — this assumes pmap_tte_deallocate() drops it.
		 * Confirm against pmap_tte_deallocate()'s contract.
		 */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
		pmap_tte_deallocate(pmap, 0, 0, false, ttep, level);
	}
}
3237
3238 /*
3239 * We maintain stats and ledgers so that a task's physical footprint is:
3240 * phys_footprint = ((internal - alternate_accounting)
3241 * + (internal_compressed - alternate_accounting_compressed)
3242 * + iokit_mapped
3243 * + purgeable_nonvolatile
3244 * + purgeable_nonvolatile_compressed
3245 * + page_table)
3246 * where "alternate_accounting" includes "iokit" and "purgeable" memory.
3247 */
3248
3249 /*
3250 * Retire the given physical map from service.
3251 * Should only be called if the map contains
3252 * no valid mappings.
3253 */
/**
 * Drop a reference on a pmap, and when the last reference is released,
 * tear it down: unmap the commpage, free all translation tables, flush the
 * TLB, release the ASID, and free the pmap structure.
 *
 * The map must contain no valid user mappings by the time the final
 * reference is dropped.
 *
 * @param pmap The pmap to release; NULL is tolerated as a no-op.
 */
MARK_AS_PMAP_TEXT void
pmap_destroy_internal(
	pmap_t pmap)
{
	if (pmap == PMAP_NULL) {
		return;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Only the thread that drops the count to exactly 0 proceeds with teardown. */
	int32_t ref_count = os_atomic_dec(&pmap->ref_count, relaxed);
	if (ref_count > 0) {
		return;
	} else if (__improbable(ref_count < 0)) {
		panic("pmap %p: refcount underflow", pmap);
	} else if (__improbable(pmap == kernel_pmap)) {
		panic("pmap %p: attempt to destroy kernel pmap", pmap);
	} else if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("pmap %p: attempt to destroy commpage pmap", pmap);
	}

#if XNU_MONITOR
	/*
	 * Issue a store-load barrier to ensure the checks of nested_count and the per-CPU
	 * pmaps below will not be speculated ahead of the decrement of ref_count above.
	 * That ensures that if the pmap is currently in use elsewhere, this path will
	 * either observe it in use and panic, or PMAP_VALIDATE_MUTABLE will observe a
	 * ref_count of 0 and panic.
	 */
	os_atomic_thread_fence(seq_cst);
	if (__improbable(os_atomic_load(&pmap->nested_count, relaxed) != 0)) {
		panic("pmap %p: attempt to destroy while nested", pmap);
	}
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->inflight_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while in-flight on cpu %llu", pmap, (uint64_t)i);
		} else if (__improbable(os_atomic_load(&cpu_data->active_pmap, relaxed) == pmap)) {
			panic("pmap %p: attempting to destroy while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif
	pmap_unmap_commpage(pmap);

	/* Unlink from the global list of live pmaps. */
	pmap_simple_lock(&pmaps_lock);
	queue_remove(&map_pmap_list, pmap, pmap_t, pmaps);
	pmap_simple_unlock(&pmaps_lock);

	pmap_trim_self(pmap);

	/*
	 * Free the memory maps, then the
	 * pmap structure.
	 */
	pmap_deallocate_all_leaf_tts(pmap, pmap->tte, pt_attr_root_level(pt_attr));



	if (pmap->tte) {
		pmap_tt1_deallocate(pmap, pmap->tte, pmap_root_alloc_size(pmap), 0);
		pmap->tte = (tt_entry_t *) NULL;
		pmap->ttep = 0;
	}

	assert((tt_free_entry_t*)pmap->tt_entry_free == NULL);

	if (__improbable(pmap->type == PMAP_TYPE_NESTED)) {
		/* Nested (shared-region) pmaps have no ASID of their own; flush by region. */
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(pmap->nested_region_addr, pmap->nested_region_size, pmap, false, false);
		sync_tlb_flush();
	} else {
		pmap_get_pt_ops(pmap)->flush_tlb_async(pmap);
		sync_tlb_flush();
		/* return its asid to the pool */
		pmap_get_pt_ops(pmap)->free_id(pmap);
		if (pmap->nested_pmap != NULL) {
#if XNU_MONITOR
			os_atomic_dec(&pmap->nested_pmap->nested_count, relaxed);
#endif
			/* release the reference we hold on the nested pmap */
			pmap_destroy_internal(pmap->nested_pmap);
		}
	}

	pmap_check_ledgers(pmap);

	if (pmap->nested_region_unnested_table_bitmap) {
#if XNU_MONITOR
		pmap_pages_free(kvtophys_nofail((vm_offset_t)(pmap->nested_region_unnested_table_bitmap)), PAGE_SIZE);
#else
		kfree_data(pmap->nested_region_unnested_table_bitmap,
		    pmap->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
#endif
	}

#if XNU_MONITOR
	if (pmap->ledger) {
		pmap_ledger_release(pmap->ledger);
	}

	pmap_lock_destroy(pmap);
	pmap_free_pmap(pmap);
#else
	pmap_lock_destroy(pmap);
	zfree(pmap_zone, pmap);
#endif
}
3366
/*
 * Public entry point for releasing a pmap reference. The caller must pass a
 * valid (non-NULL) pmap: its ledger and hw_asid are read here before the
 * internal call.
 */
void
pmap_destroy(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);

	/* Capture the ledger first; the pmap may be freed by the call below. */
	ledger_t ledger = pmap->ledger;

#if XNU_MONITOR
	pmap_destroy_ppl(pmap);

	pmap_ledger_check_balance(pmap);
#else
	pmap_destroy_internal(pmap);
#endif

	/* Balance the reference taken in pmap_create_options(). */
	ledger_dereference(ledger);

	PMAP_TRACE(1, PMAP_CODE(PMAP__DESTROY) | DBG_FUNC_END);
}
3387
3388
3389 /*
3390 * Add a reference to the specified pmap.
3391 */
3392 MARK_AS_PMAP_TEXT void
pmap_reference_internal(pmap_t pmap)3393 pmap_reference_internal(
3394 pmap_t pmap)
3395 {
3396 if (pmap != PMAP_NULL) {
3397 validate_pmap_mutable(pmap);
3398 os_atomic_inc(&pmap->ref_count, relaxed);
3399 }
3400 }
3401
/* Public wrapper: take a pmap reference, via the PPL when enabled. */
void
pmap_reference(
	pmap_t pmap)
{
#if XNU_MONITOR
	pmap_reference_ppl(pmap);
#else
	pmap_reference_internal(pmap);
#endif
}
3412
/**
 * Allocate a root (TT1) translation table of the given size.
 *
 * Sub-page root tables are carved out of full pages; leftover slices are
 * pushed onto the global free_tt_list for reuse.
 *
 * @param pmap   The pmap the table is being allocated for (for accounting).
 * @param size   Requested table size; sizes below PAGE_SIZE other than
 *               PMAP_ROOT_ALLOC_SIZE are rounded up to a full page.
 * @param option PMAP_TT_ALLOCATE_NOWAIT to fail rather than block.
 *
 * @return Kernel virtual address of the table, or NULL on shortage.
 */
static tt_entry_t *
pmap_tt1_allocate(
	pmap_t pmap,
	vm_size_t size,
	unsigned option)
{
	tt_entry_t *tt1 = NULL;
	tt_free_entry_t *tt1_free;
	pmap_paddr_t pa;
	vm_address_t va;
	vm_address_t va_end;
	kern_return_t ret;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	/* First try to satisfy the request from the matching free list. */
	pmap_simple_lock(&tt1_lock);
	if ((size == PAGE_SIZE) && (free_page_size_tt_count != 0)) {
		free_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size == 2 * PAGE_SIZE) && (free_two_page_size_tt_count != 0)) {
		free_two_page_size_tt_count--;
		tt1 = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt1)->next;
	} else if ((size < PAGE_SIZE) && (free_tt_count != 0)) {
		free_tt_count--;
		tt1 = (tt_entry_t *)free_tt_list;
		free_tt_list = (tt_free_entry_t *)((tt_free_entry_t *)tt1)->next;
	}

	pmap_simple_unlock(&tt1_lock);

	if (tt1 != NULL) {
		pmap_tt_ledger_credit(pmap, size);
		return (tt_entry_t *)tt1;
	}

	/* Free lists were empty: allocate fresh zeroed pages. */
	ret = pmap_pages_alloc_zeroed(&pa, (unsigned)((size < PAGE_SIZE)? PAGE_SIZE : size), ((option & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0));

	if (ret == KERN_RESOURCE_SHORTAGE) {
		return (tt_entry_t *)0;
	}

#if XNU_MONITOR
	assert(pa);
#endif

	if (size < PAGE_SIZE) {
		/*
		 * The caller only needs the first `size` bytes of the page; chain
		 * the remaining slices onto the sub-page free list for later reuse.
		 */
		va = phystokv(pa) + size;
		tt_free_entry_t *local_free_list = (tt_free_entry_t*)va;
		tt_free_entry_t *next_free = NULL;
		for (va_end = phystokv(pa) + PAGE_SIZE; va < va_end; va = va + size) {
			tt1_free = (tt_free_entry_t *)va;
			tt1_free->next = next_free;
			next_free = tt1_free;
		}
		pmap_simple_lock(&tt1_lock);
		local_free_list->next = free_tt_list;
		free_tt_list = next_free;
		free_tt_count += ((PAGE_SIZE / size) - 1);
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		pmap_simple_unlock(&tt1_lock);
	}

	/* Always report root allocations in units of PMAP_ROOT_ALLOC_SIZE, which can be obtained by sysctl arm_pt_root_size.
	 * Depending on the device, this can vary between 512b and 16K. */
	OSAddAtomic((uint32_t)(size / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));
	OSAddAtomic64(size / PMAP_ROOT_ALLOC_SIZE, &alloc_tteroot_count);
	pmap_tt_ledger_credit(pmap, size);

	return (tt_entry_t *) phystokv(pa);
}
3489
/**
 * Return a root (TT1) translation table to the appropriate free list, then
 * (unless PMAP_TT_DEALLOCATE_NOBLOCK is set) trim the page-sized free lists
 * back below their high-water thresholds by freeing surplus pages.
 *
 * @param pmap   The pmap the table belonged to (for accounting).
 * @param tt     The table to release.
 * @param size   Its size; sub-page sizes other than PMAP_ROOT_ALLOC_SIZE are
 *               normalized to PAGE_SIZE, mirroring pmap_tt1_allocate().
 * @param option PMAP_TT_DEALLOCATE_NOBLOCK to skip the trimming pass.
 */
static void
pmap_tt1_deallocate(
	pmap_t pmap,
	tt_entry_t *tt,
	vm_size_t size,
	unsigned option)
{
	tt_free_entry_t *tt_entry;

	if ((size < PAGE_SIZE) && (size != PMAP_ROOT_ALLOC_SIZE)) {
		size = PAGE_SIZE;
	}

	tt_entry = (tt_free_entry_t *)tt;
	assert(not_in_kdp);
	pmap_simple_lock(&tt1_lock);

	/* Push the table onto the free list matching its size. */
	if (size < PAGE_SIZE) {
		free_tt_count++;
		if (free_tt_count > free_tt_max) {
			free_tt_max = free_tt_count;
		}
		tt_entry->next = free_tt_list;
		free_tt_list = tt_entry;
	}

	if (size == PAGE_SIZE) {
		free_page_size_tt_count++;
		if (free_page_size_tt_count > free_page_size_tt_max) {
			free_page_size_tt_max = free_page_size_tt_count;
		}
		tt_entry->next = free_page_size_tt_list;
		free_page_size_tt_list = tt_entry;
	}

	if (size == 2 * PAGE_SIZE) {
		free_two_page_size_tt_count++;
		if (free_two_page_size_tt_count > free_two_page_size_tt_max) {
			free_two_page_size_tt_max = free_two_page_size_tt_count;
		}
		tt_entry->next = free_two_page_size_tt_list;
		free_two_page_size_tt_list = tt_entry;
	}

	if (option & PMAP_TT_DEALLOCATE_NOBLOCK) {
		/* Caller can't block: skip trimming, which may free pages. */
		pmap_simple_unlock(&tt1_lock);
		pmap_tt_ledger_debit(pmap, size);
		return;
	}

	/* Trim surplus single-page tables; the lock is dropped around each free. */
	while (free_page_size_tt_count > FREE_PAGE_SIZE_TT_MAX) {
		free_page_size_tt_count--;
		tt = (tt_entry_t *)free_page_size_tt_list;
		free_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), PAGE_SIZE);

		OSAddAtomic(-(int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}

	/* Likewise trim surplus two-page tables. */
	while (free_two_page_size_tt_count > FREE_TWO_PAGE_SIZE_TT_MAX) {
		free_two_page_size_tt_count--;
		tt = (tt_entry_t *)free_two_page_size_tt_list;
		free_two_page_size_tt_list = ((tt_free_entry_t *)tt)->next;

		pmap_simple_unlock(&tt1_lock);

		pmap_pages_free(ml_static_vtop((vm_offset_t)tt), 2 * PAGE_SIZE);

		OSAddAtomic(-2 * (int32_t)(PAGE_SIZE / PMAP_ROOT_ALLOC_SIZE), (pmap == kernel_pmap ? &inuse_kernel_tteroot_count : &inuse_user_tteroot_count));

		pmap_simple_lock(&tt1_lock);
	}
	pmap_simple_unlock(&tt1_lock);
	pmap_tt_ledger_debit(pmap, size);
}
3570
/**
 * Allocate a non-root translation table for the pmap, either by reusing an
 * entry from the pmap's per-pmap free list or by allocating a fresh VM page
 * plus its page table descriptor (PTD).
 *
 * @param pmap    The pmap the table is for.
 * @param ttp     Out: kernel virtual address of the new table.
 * @param level   Translation level the table will serve (for statistics).
 * @param options PMAP_TT_ALLOCATE_NOWAIT / PMAP_OPTIONS_NOWAIT to fail
 *                rather than block on memory shortage.
 *
 * @return KERN_SUCCESS, KERN_RESOURCE_SHORTAGE, or KERN_ABORTED if a
 *         preemptible lock acquisition was interrupted.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_tt_allocate(
	pmap_t pmap,
	tt_entry_t **ttp,
	unsigned int level,
	unsigned int options)
{
	pmap_paddr_t pa;
	*ttp = NULL;

	/* Traverse the tt_entry_free list to find a free tt_entry */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
		return KERN_ABORTED;
	}

	if ((tt_free_entry_t *)pmap->tt_entry_free != NULL) {
		tt_free_entry_t *tt_free_cur, *tt_free_next;

		tt_free_cur = ((tt_free_entry_t *)pmap->tt_entry_free);
		tt_free_next = tt_free_cur->next;
		tt_free_cur->next = NULL;
		*ttp = (tt_entry_t *)tt_free_cur;
		pmap->tt_entry_free = (tt_entry_t *)tt_free_next;
	}
	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/* Only do the heavylifting here when we don't have a free tt_entry. */
	if (*ttp == NULL) {
		pt_desc_t *ptdp;

		/*
		 * Allocate a VM page for the level x page table entries.
		 */
		while (pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0)) != KERN_SUCCESS) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Allocate a new Page Table Descriptor for the newly allocated page table. */
		while ((ptdp = ptd_alloc(pmap)) == NULL) {
			if (options & PMAP_OPTIONS_NOWAIT) {
				/* Deallocate all allocated resources so far. */
				pmap_pages_free(pa, PAGE_SIZE);
				return KERN_RESOURCE_SHORTAGE;
			}
			VM_PAGE_WAIT();
		}

		/* Account the page as a TTE page or a PTE page depending on its level. */
		if (level < pt_attr_leaf_level(pmap_get_pt_attr(pmap))) {
			OSAddAtomic64(1, &alloc_ttepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic64(1, &alloc_ptepages_count);
			OSAddAtomic(1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}

		pmap_tt_ledger_credit(pmap, PAGE_SIZE);

		PMAP_ZINFO_PALLOC(pmap, PAGE_SIZE);

		/* Mark the page's pv_head_table entry as holding a page table descriptor. */
		pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), ptdp, PVH_TYPE_PTDP);
		/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
		pvh_set_flags(pai_to_pvh(pa_index(pa)), 0);

		uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap));
		if (PAGE_SIZE > pmap_page_size) {
			/*
			 * The pmap's page tables are smaller than a VM page: keep the
			 * first slice and chain the remainder onto the pmap's free list.
			 */
			vm_address_t va;
			vm_address_t va_end;

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				/* Deallocate all allocated resources so far. */
				pvh_update_head_unlocked(pai_to_pvh(pa_index(pa)), PV_ENTRY_NULL, PVH_TYPE_NULL);
				PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
				pmap_tt_ledger_debit(pmap, PAGE_SIZE);
				pmap_pages_free(pa, PAGE_SIZE);
				ptd_deallocate(ptdp);

				return KERN_ABORTED;
			}

			for (va_end = phystokv(pa) + PAGE_SIZE, va = phystokv(pa) + pmap_page_size; va < va_end; va = va + pmap_page_size) {
				((tt_free_entry_t *)va)->next = (tt_free_entry_t *)pmap->tt_entry_free;
				pmap->tt_entry_free = (tt_entry_t *)va;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}

		*ttp = (tt_entry_t *)phystokv(pa);
	}

#if XNU_MONITOR
	assert(*ttp);
#endif

	return KERN_SUCCESS;
}
3669
3670
/**
 * Return a page table previously allocated by pmap_tt_allocate() to the
 * pmap's free list, and release the backing physical page once every
 * table packed into that page is unused.
 *
 * @note The table's refcnt must be zero (leaf tables) or the
 *       PT_DESC_REFCOUNT sentinel (non-leaf tables) on entry; anything
 *       else panics.
 *
 * @param pmap The pmap that owns the page table being deallocated.
 * @param ttp Pointer to the first entry of the table being deallocated.
 * @param level The level of the page table being deallocated.
 */
static void
pmap_tt_deallocate(
	pmap_t pmap,
	tt_entry_t *ttp,
	unsigned int level)
{
	pt_desc_t *ptdp;
	ptd_info_t *ptd_info;
	unsigned pt_acc_cnt;
	unsigned i;
	vm_offset_t free_page = 0;
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/* Number of pmap-native-page-sized tables packed into one kernel page. */
	unsigned max_pt_index = PAGE_SIZE / pt_attr_page_size(pt_attr);

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	ptdp = ptep_get_ptd(ttp);
	ptd_info = ptd_get_info(ptdp, ttp);

	/* Invalidate this table's VA slot in the page table descriptor. */
	ptdp->va[ptd_get_index(ptdp, ttp)] = (vm_offset_t)-1;

	/*
	 * Non-leaf tables carry the sentinel refcount rather than a live count;
	 * clear it here so the zero-refcount check below passes for them.
	 */
	if ((level < pt_attr_leaf_level(pt_attr)) && (ptd_info->refcnt == PT_DESC_REFCOUNT)) {
		ptd_info->refcnt = 0;
	}

	if (__improbable(ptd_info->refcnt != 0)) {
		panic("pmap_tt_deallocate(): ptdp %p, count %d", ptdp, ptd_info->refcnt);
	}

	/* Sum the refcounts of all tables that share this physical page. */
	for (i = 0, pt_acc_cnt = 0; i < max_pt_index; i++) {
		pt_acc_cnt += ptdp->ptd_info[i].refcnt;
	}

	if (pt_acc_cnt == 0) {
		/*
		 * No table in this page holds mappings. Count how many of the page's
		 * tables (including this one) are free; if all of them are, the whole
		 * page can be unlinked from the free list and returned to the VM.
		 */
		tt_free_entry_t *tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
		unsigned pt_free_entry_cnt = 1;

		while (pt_free_entry_cnt < max_pt_index && tt_free_list) {
			tt_free_entry_t *tt_free_list_next;

			tt_free_list_next = tt_free_list->next;
			/* Does this free-list entry fall within the same physical page as ttp? */
			if ((((vm_offset_t)tt_free_list_next) - ((vm_offset_t)ttp & ~PAGE_MASK)) < PAGE_SIZE) {
				pt_free_entry_cnt++;
			}
			tt_free_list = tt_free_list_next;
		}
		if (pt_free_entry_cnt == max_pt_index) {
			/* Every sibling table is free: splice all of them out of the free list. */
			tt_free_entry_t *tt_free_list_cur;

			free_page = (vm_offset_t)ttp & ~PAGE_MASK;
			tt_free_list = (tt_free_entry_t *)&pmap->tt_entry_free;
			tt_free_list_cur = (tt_free_entry_t *)&pmap->tt_entry_free;

			while (tt_free_list_cur) {
				tt_free_entry_t *tt_free_list_next;

				tt_free_list_next = tt_free_list_cur->next;
				if ((((vm_offset_t)tt_free_list_next) - free_page) < PAGE_SIZE) {
					/* Entry lives in the page being freed: unlink it. */
					tt_free_list->next = tt_free_list_next->next;
				} else {
					tt_free_list = tt_free_list_next;
				}
				tt_free_list_cur = tt_free_list_next;
			}
		} else {
			/* Some sibling tables are still allocated; just push this one onto the free list. */
			((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
			pmap->tt_entry_free = ttp;
		}
	} else {
		/* Other tables in this page still hold mappings; push this one onto the free list. */
		((tt_free_entry_t *)ttp)->next = (tt_free_entry_t *)pmap->tt_entry_free;
		pmap->tt_entry_free = ttp;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	if (free_page != 0) {
		/*
		 * The entire page is empty: release its descriptor, clear the PV head
		 * back-pointer, free the page, and update the in-use counts/ledgers.
		 */
		ptd_deallocate(ptep_get_ptd((pt_entry_t*)free_page));
		*(pt_desc_t **)pai_to_pvh(pa_index(ml_static_vtop(free_page))) = NULL;
		pmap_pages_free(ml_static_vtop(free_page), PAGE_SIZE);
		if (level < pt_attr_leaf_level(pt_attr)) {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ttepages_count : &inuse_user_ttepages_count));
		} else {
			OSAddAtomic(-1, (pmap == kernel_pmap ? &inuse_kernel_ptepages_count : &inuse_user_ptepages_count));
		}
		PMAP_ZINFO_PFREE(pmap, PAGE_SIZE);
		pmap_tt_ledger_debit(pmap, PAGE_SIZE);
	}
}
3759
3760 /**
3761 * Safely clear out a translation table entry.
3762 *
3763 * @note If the TTE to clear out points to a leaf table, then that leaf table
3764 * must have a refcnt of zero before the TTE can be removed.
3765 * @note This function expects to be called with pmap locked exclusive, and will
3766 * return with pmap unlocked.
3767 *
3768 * @param pmap The pmap containing the page table whose TTE is being removed.
3769 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3770 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3771 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3772 * @param ttep Pointer to the TTE that should be cleared out.
3773 * @param level The level of the page table that contains the TTE to be removed.
3774 */
static void
pmap_tte_remove(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const tt_entry_t tte = *ttep;

	if (__improbable(tte == ARM_TTE_EMPTY)) {
		panic("%s: L%d TTE is already empty. Potential double unmap or memory "
		    "stomper? pmap=%p ttep=%p", __func__, level, pmap, ttep);
	}

	/* Clear the TTE and force the write out before any TLB maintenance. */
	*ttep = (tt_entry_t) 0;
	FLUSH_PTE_STRONG();
	// If given a VA range, we're being asked to flush the TLB before the table in ttep is freed.
	if (va_end > va_start) {
		PMAP_UPDATE_TLBS(pmap, va_start, va_end, need_strong_sync, false);
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	/**
	 * Remember, the passed in "level" parameter refers to the level above the
	 * table that's getting removed (e.g., removing an L2 TTE will unmap an L3
	 * page table).
	 */
	const bool remove_leaf_table = (level == pt_attr_twig_level(pt_attr));

	/**
	 * Non-leaf pagetables don't track active references in the PTD and instead
	 * use a sentinel refcount. If we're removing a leaf pagetable, we'll load
	 * the real refcount below.
	 */
	unsigned short refcnt = PT_DESC_REFCOUNT;

	/*
	 * It's possible that a concurrent pmap_disconnect() operation may need to reference
	 * a PTE on the pagetable page to be removed. A full disconnect() may have cleared
	 * one or more PTEs on this page but not yet dropped the refcount, which would cause
	 * us to panic in this function on a non-zero refcount. Moreover, it's possible for
	 * a disconnect-to-compress operation to set the compressed marker on a PTE, and
	 * for pmap_remove_range_options() to concurrently observe that marker, clear it, and
	 * drop the pagetable refcount accordingly, without taking any PVH locks that could
	 * synchronize it against the disconnect operation. If that removal caused the
	 * refcount to reach zero, the pagetable page could be freed before the disconnect
	 * operation is finished using the relevant pagetable descriptor.
	 * Address these cases by waiting until all CPUs have been observed to not be
	 * executing pmap_disconnect().
	 */
	if (remove_leaf_table) {
		/* One bit per CPU still to be checked; start with all CPUs set. */
		bitmap_t active_disconnects[BITMAP_LEN(MAX_CPUS)];
		const int max_cpu = ml_get_max_cpu_number();
		bitmap_full(&active_disconnects[0], max_cpu + 1);
		bool inflight_disconnect;

		/*
		 * Ensure the ensuing load of per-CPU inflight_disconnect is not speculated
		 * ahead of any prior PTE load which may have observed the effect of a
		 * concurrent disconnect operation. An acquire fence is required for this;
		 * a load-acquire operation is insufficient.
		 */
		os_atomic_thread_fence(acquire);
		do {
			inflight_disconnect = false;
			for (int i = bitmap_first(&active_disconnects[0], max_cpu + 1);
			    i >= 0;
			    i = bitmap_next(&active_disconnects[0], i)) {
				const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
				if (cpu_data == NULL) {
					continue;
				}
				/*
				 * The load-exclusive arms the local monitor so that WFE wakes
				 * when the remote CPU writes inflight_disconnect; leave this
				 * CPU's bit set so it is re-checked on the next pass.
				 */
				if (os_atomic_load_exclusive(&cpu_data->inflight_disconnect, relaxed)) {
					__builtin_arm_wfe();
					inflight_disconnect = true;
					continue;
				}
				os_atomic_clear_exclusive();
				/* This CPU is not in pmap_disconnect(); don't check it again. */
				bitmap_clear(&active_disconnects[0], (unsigned int)i);
			}
		} while (inflight_disconnect);
		/* Ensure the refcount is observed after any observation of inflight_disconnect */
		os_atomic_thread_fence(acquire);
		refcnt = os_atomic_load(&(ptep_get_info((pt_entry_t*)ttetokv(tte))->refcnt), relaxed);
	}

#if MACH_ASSERT
	/**
	 * On internal devices, always do the page table consistency check
	 * regardless of page table level or the actual refcnt value.
	 */
	{
#else /* MACH_ASSERT */
	/**
	 * Only perform the page table consistency check when deleting leaf page
	 * tables and it seems like there might be valid/compressed mappings
	 * leftover.
	 */
	if (__improbable(remove_leaf_table && refcnt != 0)) {
#endif /* MACH_ASSERT */

		/**
		 * There are multiple problems that can arise as a non-zero refcnt:
		 * 1. A bug in the refcnt management logic.
		 * 2. A memory stomper or hardware failure.
		 * 3. The VM forgetting to unmap all of the valid mappings in an address
		 *    space before destroying a pmap.
		 *
		 * By looping over the page table and determining how many valid or
		 * compressed entries there actually are, we can narrow down which of
		 * these three cases is causing this panic. If the expected refcnt
		 * (valid + compressed) and the actual refcnt don't match then the
		 * problem is probably either a memory corruption issue (if the
		 * non-empty entries don't match valid+compressed, that could also be a
		 * sign of corruption) or refcnt management bug. Otherwise, there
		 * actually are leftover mappings and the higher layers of xnu are
		 * probably at fault.
		 */
		const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
		pt_entry_t *bpte = ((pt_entry_t *) (ttetokv(tte) & ~(pmap_page_size - 1)));

		pt_entry_t *ptep = bpte;
		unsigned short non_empty = 0, valid = 0, comp = 0;

		for (unsigned int i = 0; i < (pmap_page_size / sizeof(*ptep)); i++, ptep++) {
			/**
			 * Note that, for older 4K devices that emulate 16K pages, we ignore the
			 * compressed marker on PTEs that aren't at an index that's a multiple of 4.
			 * That's because it's possible for the 4-tuple PTE clear operation in
			 * pmap_remove() and the 4-tuple PTE 'compressed' marker write operation in
			 * pmap_disconnect() to race each other in such a way that the compressed marker
			 * may be left in the 2nd, 3rd, and/or 4th PTEs.
			 * This should be harmless as only the 1st PTE is used for accounting purposes,
			 * but we don't want it to trip our internal checks here.
			 */
			if (__improbable(ARM_PTE_IS_COMPRESSED(*ptep, ptep))) {
				if ((i % PAGE_RATIO) == 0) {
					comp++;
				} else {
					continue;
				}
			} else if (__improbable((*ptep & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE)) {
				valid++;
			}

			/* Keep track of all non-empty entries to detect memory corruption. */
			if (__improbable(*ptep != ARM_PTE_EMPTY)) {
				non_empty++;
			}
		}

#if MACH_ASSERT
		/**
		 * On internal machines, panic whenever a page table getting deleted has
		 * leftover mappings (valid or otherwise) or a leaf page table has a
		 * non-zero refcnt.
		 */
		if (__improbable((non_empty != 0) || (remove_leaf_table && refcnt != 0))) {
#else /* MACH_ASSERT */
		/* We already know the leaf page-table has a non-zero refcnt, so panic. */
		{
#endif /* MACH_ASSERT */
			panic("%s: Found inconsistent state in soon to be deleted L%d table: %d valid, "
			    "%d compressed, %d non-empty, refcnt=%d, L%d tte=%#llx, pmap=%p, bpte=%p", __func__,
			    level + 1, valid, comp, non_empty, refcnt, level, (uint64_t)tte, pmap, bpte);
		}
	}
}
3949
3950 /**
3951 * Given a pointer to an entry within a `level` page table, delete the
3952 * page table at `level` + 1 that is represented by that entry. For instance,
3953 * to delete an unused L3 table, `ttep` would be a pointer to the L2 entry that
3954 * contains the PA of the L3 table, and `level` would be "2".
3955 *
3956 * @note If the table getting deallocated is a leaf table, then that leaf table
3957 * must have a refcnt of zero before getting deallocated. All other levels
3958 * must have a refcnt of PT_DESC_REFCOUNT in their page table descriptor.
3959 * @note This function expects to be called with pmap locked exclusive and will
3960 * return with pmap unlocked.
3961 *
3962 * @param pmap The pmap that owns the page table to be deallocated.
3963 * @param va_start Beginning of the VA range mapped by the table being removed, for TLB maintenance.
3964 * @param va_end Non-inclusive end of the VA range mapped by the table being removed, for TLB maintenance.
3965 * @param need_strong_sync Indicates whether strong DSB should be used to synchronize TLB maintenance.
3966 * @param ttep Pointer to the `level` TTE to remove.
3967 * @param level The level of the table that contains an entry pointing to the
3968 * table to be removed. The deallocated page table will be a
3969 * `level` + 1 table (so if `level` is 2, then an L3 table will be
3970 * deleted).
3971 */
void
pmap_tte_deallocate(
	pmap_t pmap,
	vm_offset_t va_start,
	vm_offset_t va_end,
	bool need_strong_sync,
	tt_entry_t *ttep,
	unsigned int level)
{
	tt_entry_t tte;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	tte = *ttep;

	/* The descriptor records the owning pmap; a mismatch means we'd free a foreign table. */
	if (tte_get_ptd(tte)->pmap != pmap) {
		panic("%s: Passed in pmap doesn't own the page table to be deleted ptd=%p ptd->pmap=%p pmap=%p",
		    __func__, tte_get_ptd(tte), tte_get_ptd(tte)->pmap, pmap);
	}

	/* Only table-type entries may be deallocated through this path. */
	assertf((tte & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE, "%s: invalid TTE %p (0x%llx)",
	    __func__, ttep, (unsigned long long)tte);

	/* pmap_tte_remove() will drop the pmap lock */
	pmap_tte_remove(pmap, va_start, va_end, need_strong_sync, ttep, level);

	/* The TTE is now clear; free the (level + 1) table it used to point to. */
	pmap_tt_deallocate(pmap, (tt_entry_t *) phystokv(tte_to_pa(tte)), level + 1);
}
4000
4001 /*
4002 * Remove a range of hardware page-table entries.
4003 * The entries given are the first (inclusive)
4004 * and last (exclusive) entries for the VM pages.
4005 * The virtual address is the va for the first pte.
4006 *
4007 * The pmap must be locked.
4008 * If the pmap is not the kernel pmap, the range must lie
4009 * entirely within one pte-page. This is NOT checked.
4010 * Assumes that the pte-page exists.
4011 *
4012 * Returns the number of PTE changed
4013 */
4014 MARK_AS_PMAP_TEXT static int
4015 pmap_remove_range(
4016 pmap_t pmap,
4017 vm_map_address_t va,
4018 pt_entry_t *bpte,
4019 pt_entry_t *epte)
4020 {
4021 bool need_strong_sync = false;
4022 int num_changed = pmap_remove_range_options(pmap, va, bpte, epte, NULL,
4023 &need_strong_sync, PMAP_OPTIONS_REMOVE);
4024 if (num_changed > 0) {
4025 PMAP_UPDATE_TLBS(pmap, va,
4026 va + (pt_attr_page_size(pmap_get_pt_attr(pmap)) * (epte - bpte)), need_strong_sync, true);
4027 }
4028 return num_changed;
4029 }
4030
4031
4032 #ifdef PVH_FLAG_EXEC
4033
4034 /*
4035 * Update the access protection bits of the physical aperture mapping for a page.
 * This is useful, for example, in guaranteeing that a verified executable page
4037 * has no writable mappings anywhere in the system, including the physical
4038 * aperture. flush_tlb_async can be set to true to avoid unnecessary TLB
4039 * synchronization overhead in cases where the call to this function is
4040 * guaranteed to be followed by other TLB operations.
4041 */
void
pmap_set_ptov_ap(unsigned int pai __unused, unsigned int ap __unused, boolean_t flush_tlb_async __unused)
{
#if __ARM_PTE_PHYSMAP__
	pvh_assert_locked(pai);
	/* Locate the physical-aperture (physmap) mapping of this physical page. */
	vm_offset_t kva = phystokv(vm_first_phys + (pmap_paddr_t)ptoa(pai));
	pt_entry_t *pte_p = pmap_pte(kernel_pmap, kva);

	pt_entry_t tmplate = *pte_p;
	/* Nothing to do if the mapping already carries the requested AP bits. */
	if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(ap)) {
		return;
	}
	tmplate = (tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(ap);
	/* Physical aperture PTEs are never expected to have the hint bit set. */
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __func__, pte_p, (void *)kva, tmplate);
	}
	write_pte_strong(pte_p, tmplate);
	/* Issue the TLB invalidate now; the synchronizing flush may be deferred. */
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	if (!flush_tlb_async) {
		sync_tlb_flush();
	}
#endif
}
4066 #endif /* defined(PVH_FLAG_EXEC) */
4067
4068
4069
/**
 * Remove the mappings for a contiguous range of PTEs within a single page
 * table. PTE writes are made visible with FLUSH_PTE_STRONG() at the end,
 * but no TLB invalidation is performed here; callers are responsible for
 * flushing the TLB for the affected VA range.
 *
 * @note The pmap must be locked exclusive on entry, and remains locked.
 *
 * @param pmap The pmap whose mappings are being removed.
 * @param va The VA mapped by the first PTE in the range.
 * @param bpte Pointer to the first PTE to remove (inclusive).
 * @param epte Pointer past the last PTE to remove (exclusive).
 * @param eva If non-NULL, enables per-iteration preemption checks; on early
 *            exit, receives the VA at which processing stopped.
 * @param need_strong_sync Set to true if any removed PTE requires a strong
 *                         DSB for TLB synchronization (HAS_FEAT_XS only).
 * @param options Removal options, e.g. PMAP_OPTIONS_REMOVE to also clear
 *                "compressed" markers.
 *
 * @return The number of PTEs actually changed.
 */
MARK_AS_PMAP_TEXT int
pmap_remove_range_options(
	pmap_t pmap,
	vm_map_address_t va,
	pt_entry_t *bpte,
	pt_entry_t *epte,
	vm_map_address_t *eva,
	bool *need_strong_sync __unused,
	int options)
{
	pt_entry_t *cpte;
	size_t npages = 0;
	int num_removed, num_unwired;
	int num_pte_changed;
	unsigned int pai = 0;
	pmap_paddr_t pa;
	int num_external, num_internal, num_reusable;
	int num_alt_internal;
	uint64_t num_compressed, num_alt_compressed;
	int16_t refcnt = 0;

	pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);

	/* The range must lie entirely within one page table page. */
	if (__improbable((uintptr_t)epte > (((uintptr_t)bpte + pmap_page_size) & ~(pmap_page_size - 1)))) {
		panic("%s: PTE range [%p, %p) in pmap %p crosses page table boundary", __func__, bpte, epte, pmap);
	}

	if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
		panic("%s: attempt to remove mappings from commpage pmap %p", __func__, pmap);
	}

	if (__improbable((pmap == kernel_pmap) && (va >= physmap_base) && (va < physmap_end))) {
		panic("%s: attempt to remove mappings from the physical aperture for va: %p", __func__, (const void *) va);
	}

	num_removed = 0;
	num_unwired = 0;
	num_pte_changed = 0;
	num_external = 0;
	num_internal = 0;
	num_reusable = 0;
	num_compressed = 0;
	num_alt_internal = 0;
	num_alt_compressed = 0;

#if XNU_MONITOR
	bool ro_va = false;
	if (__improbable((pmap == kernel_pmap) && (eva != NULL) && zone_spans_ro_va(va, *eva))) {
		ro_va = true;
	}
#endif
	for (cpte = bpte; cpte < epte;
	    cpte += PAGE_RATIO, va += pmap_page_size) {
		pt_entry_t spte;
		boolean_t managed = FALSE;

		/*
		 * Check for pending preemption on every iteration: the PV list may be arbitrarily long,
		 * so we need to be as aggressive as possible in checking for preemption when we can.
		 */
		if (__improbable((eva != NULL) && npages++ && pmap_pending_preemption())) {
			*eva = va;
			break;
		}

		spte = *((volatile pt_entry_t*)cpte);

		/*
		 * Loop until we either determine the PTE is unmanaged or we hold the
		 * PVH lock for its page with the PTE still pointing at that page.
		 */
		while (!managed) {
			if (pmap != kernel_pmap &&
			    (options & PMAP_OPTIONS_REMOVE) &&
			    (ARM_PTE_IS_COMPRESSED(spte, cpte))) {
				/*
				 * "pmap" must be locked at this point,
				 * so this should not race with another
				 * pmap_remove_range() or pmap_enter().
				 */

				/* one less "compressed"... */
				num_compressed++;
				if (spte & ARM_PTE_COMPRESSED_ALT) {
					/* ... but it used to be "ALTACCT" */
					num_alt_compressed++;
				}

				/* clear marker */
				write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
				/*
				 * "refcnt" also accounts for
				 * our "compressed" markers,
				 * so let's update it here.
				 */
				--refcnt;
				spte = *((volatile pt_entry_t*)cpte);
			}
			/*
			 * It may be possible for the pte to transition from managed
			 * to unmanaged in this timeframe; for now, elide the assert.
			 * We should break out as a consequence of checking pa_valid.
			 */
			//assert(!ARM_PTE_IS_COMPRESSED(spte));
			pa = pte_to_pa(spte);
			if (!pa_valid(pa)) {
#if XNU_MONITOR
				unsigned int cacheattr = pmap_cache_attributes((ppnum_t)atop(pa));
#endif
#if XNU_MONITOR
				if (__improbable((cacheattr & PP_ATTR_MONITOR) &&
				    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) && !pmap_ppl_disable)) {
					panic("%s: attempt to remove mapping of writable PPL-protected I/O address 0x%llx",
					    __func__, (uint64_t)pa);
				}
#endif
				break;
			}
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				*need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */
			pai = pa_index(pa);
			pvh_lock(pai);
			/* Re-read the PTE: it may have changed before the PVH lock was acquired. */
			spte = *((volatile pt_entry_t*)cpte);
			pa = pte_to_pa(spte);
			if (pai == pa_index(pa)) {
				managed = TRUE;
				break; // Leave pai locked as we will unlock it after we free the PV entry
			}
			pvh_unlock(pai);
		}

		if (ARM_PTE_IS_COMPRESSED(*cpte, cpte)) {
			/*
			 * There used to be a valid mapping here but it
			 * has already been removed when the page was
			 * sent to the VM compressor, so nothing left to
			 * remove now...
			 */
			continue;
		}

		/* remove the translation, do not flush the TLB */
		if (*cpte != ARM_PTE_TYPE_FAULT) {
			assertf(!ARM_PTE_IS_COMPRESSED(*cpte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
			assertf((*cpte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)*cpte);
#if MACH_ASSERT
			if (managed && (pmap != kernel_pmap) && (ptep_get_va(cpte) != va)) {
				panic("pmap_remove_range_options(): VA mismatch: cpte=%p ptd=%p pte=0x%llx va=0x%llx, cpte va=0x%llx",
				    cpte, ptep_get_ptd(cpte), (uint64_t)*cpte, (uint64_t)va, (uint64_t)ptep_get_va(cpte));
			}
#endif
			write_pte_fast(cpte, ARM_PTE_TYPE_FAULT);
			num_pte_changed++;
		}

		/* Each removed user mapping drops the page table's refcount by one. */
		if ((spte != ARM_PTE_TYPE_FAULT) &&
		    (pmap != kernel_pmap)) {
			assertf(!ARM_PTE_IS_COMPRESSED(spte, cpte), "unexpected compressed pte %p (=0x%llx)", cpte, (uint64_t)spte);
			assertf((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", cpte, (uint64_t)spte);
			--refcnt;
		}

		if (pte_is_wired(spte)) {
			pte_set_wired(pmap, cpte, 0);
			num_unwired++;
		}
		/*
		 * if not managed, we're done
		 */
		if (!managed) {
			continue;
		}

#if XNU_MONITOR
		if (__improbable(pmap == kernel_pmap && ppattr_pa_test_no_monitor(pa))) {
			panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)pa);
		}
		if (__improbable(ro_va)) {
			pmap_ppl_unlockdown_page_locked(pai, PVH_FLAG_LOCKDOWN_RO, true);
		}
#endif

		/*
		 * find and remove the mapping from the chain for this
		 * physical address.
		 */
		bool is_internal, is_altacct;
		pmap_remove_pv(pmap, cpte, pai, true, &is_internal, &is_altacct);

		/* Classify the removed mapping for ledger accounting below. */
		if (is_altacct) {
			assert(is_internal);
			num_internal++;
			num_alt_internal++;
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_altacct(pai);
				ppattr_clear_internal(pai);
			}
		} else if (is_internal) {
			if (ppattr_test_reusable(pai)) {
				num_reusable++;
			} else {
				num_internal++;
			}
			if (!pvh_test_type(pai_to_pvh(pai), PVH_TYPE_PTEP)) {
				ppattr_clear_internal(pai);
			}
		} else {
			num_external++;
		}
		pvh_unlock(pai);
		num_removed++;
	}

	/*
	 * Update the counts
	 */
	pmap_ledger_debit(pmap, task_ledgers.phys_mem, num_removed * pmap_page_size);

	if (pmap != kernel_pmap) {
		/* Apply the accumulated refcount delta to the page table descriptor. */
		if ((refcnt != 0) && (OSAddAtomic16(refcnt, (SInt16 *) &(ptep_get_info(bpte)->refcnt)) <= 0)) {
			panic("pmap_remove_range_options: over-release of ptdp %p for pte [%p, %p)", ptep_get_ptd(bpte), bpte, epte);
		}

		/* update ledgers */
		pmap_ledger_debit(pmap, task_ledgers.external, (num_external) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.reusable, (num_reusable) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.wired_mem, (num_unwired) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal, (num_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, (num_alt_internal) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, (num_alt_compressed) * pmap_page_size);
		pmap_ledger_debit(pmap, task_ledgers.internal_compressed, (num_compressed) * pmap_page_size);
		/* make needed adjustments to phys_footprint */
		pmap_ledger_debit(pmap, task_ledgers.phys_footprint,
		    ((num_internal -
		    num_alt_internal) +
		    (num_compressed -
		    num_alt_compressed)) * pmap_page_size);
	}

	/* flush the ptable entries we have written */
	if (num_pte_changed > 0) {
		FLUSH_PTE_STRONG();
	}

	return num_pte_changed;
}
4318
4319
4320 /*
4321 * Remove the given range of addresses
4322 * from the specified map.
4323 *
4324 * It is assumed that the start and end are properly
4325 * rounded to the hardware page size.
4326 */
void
pmap_remove(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end)
{
	/* Convenience wrapper: remove with the default PMAP_OPTIONS_REMOVE behavior. */
	pmap_remove_options(pmap, start, end, PMAP_OPTIONS_REMOVE);
}
4335
/**
 * Internal implementation of pmap_remove_options() for a VA range that lies
 * within a single twig-level region (the caller splits larger requests at
 * twig boundaries).
 *
 * @param pmap The pmap from which to remove mappings.
 * @param start Beginning of the VA range to remove (leaf-page aligned).
 * @param end Non-inclusive end of the VA range to remove (leaf-page aligned).
 * @param options Removal options, e.g. PMAP_OPTIONS_REMOVE.
 *
 * @return The VA up to which removal actually progressed; may be less than
 *         `end` if the operation yielded for pending preemption.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_remove_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t eva = end;
	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;
	int remove_count = 0;
	bool need_strong_sync = false;
	bool unlock = true;

	validate_pmap_mutable(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must be well-ordered and leaf-page aligned. */
	if (__improbable((end < start) || ((start | end) & pt_attr_leaf_offmask(pt_attr)))) {
		panic("%s: pmap %p invalid address range %p, %p", __func__, pmap, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	/* No twig entry covers this range; nothing to remove. */
	if (tte_p == (tt_entry_t *) NULL) {
		goto done;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = bpte + ((end - start) >> pt_attr_leaf_shift(pt_attr));

		/*
		 * This check is really intended to ensure that mappings in a nested pmap can't be removed
		 * through a top-level user pmap, although it's also a useful sanity check for other pmap types.
		 * Note that kernel page tables may not have PTDs, so we can't use the check there.
		 */
		if (__improbable((pmap->type != PMAP_TYPE_KERNEL) && (ptep_get_pmap(bpte) != pmap))) {
			panic("%s: attempt to remove mappings owned by pmap %p through pmap %p, starting at pte %p",
			    __func__, ptep_get_pmap(bpte), pmap, bpte);
		}

		remove_count = pmap_remove_range_options(pmap, start, bpte, epte, &eva,
		    &need_strong_sync, options);

		/* If the leaf table no longer holds any mappings, deallocate it. */
		if ((pmap->type == PMAP_TYPE_USER) && (ptep_get_info(pte_p)->refcnt == 0)) {
			pmap_tte_deallocate(pmap, start, eva, need_strong_sync, tte_p, pt_attr_twig_level(pt_attr));
			remove_count = 0; // pmap_tte_deallocate has flushed the TLB for us
			unlock = false; // pmap_tte_deallocate() has dropped the lock
		}
	}

done:
	if (unlock) {
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	if (remove_count > 0) {
		PMAP_UPDATE_TLBS(pmap, start, eva, need_strong_sync, true);
	}
	return eva;
}
4402
/**
 * Remove the given range of addresses from the specified map, processing the
 * range one twig-level (page table) region at a time.
 *
 * @param pmap The pmap from which to remove mappings; PMAP_NULL is a no-op.
 * @param start Beginning of the VA range to remove.
 * @param end Non-inclusive end of the VA range to remove.
 * @param options Removal options passed through to the per-region removal.
 */
void
pmap_remove_options(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	int options)
{
	vm_map_address_t va;

	if (pmap == PMAP_NULL) {
		return;
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	/*
	 * Invalidate the translation buffer first
	 */
	va = start;
	while (va < end) {
		vm_map_address_t l;

		/* Clamp each chunk to the end of the current twig-level region. */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (l > end) {
			l = end;
		}

#if XNU_MONITOR
		va = pmap_remove_options_ppl(pmap, va, l, options);

		/* Validate ledger balances after returning from the PPL. */
		pmap_ledger_check_balance(pmap);
#else
		va = pmap_remove_options_internal(pmap, va, l, options);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__REMOVE) | DBG_FUNC_END);
}
4455
4456
4457 /*
4458 * Remove phys addr if mapped in specified map
4459 */
void
pmap_remove_some_phys(
	__unused pmap_t map,
	__unused ppnum_t pn)
{
	/* Implement to support working set code */
	/* Intentionally an unimplemented stub: both parameters are unused. */
}
4467
4468 /*
4469 * Implementation of PMAP_SWITCH_USER that Mach VM uses to
4470 * switch a thread onto a new vm_map.
4471 */
4472 void
4473 pmap_switch_user(thread_t thread, vm_map_t new_map)
4474 {
4475 pmap_t new_pmap = new_map->pmap;
4476
4477
4478 thread->map = new_map;
4479 pmap_set_pmap(new_pmap, thread);
4480
4481 }
4482
/**
 * Activate the given pmap as the current user address space.
 *
 * @param pmap The pmap to activate via pmap_switch().
 * @param thread The thread being switched; only referenced when
 *               __ARM_USER_PROTECT__ is configured.
 */
void
pmap_set_pmap(
	pmap_t pmap,
#if !__ARM_USER_PROTECT__
	__unused
#endif
	thread_t thread)
{
	pmap_switch(pmap);
#if __ARM_USER_PROTECT__
	/* Cache the pmap's TTB (with setup bits) and hardware ASID on the thread. */
	thread->machine.uptw_ttb = ((unsigned int) pmap->ttep) | TTBR_SETUP;
	thread->machine.asid = pmap->hw_asid;
#endif
}
4497
4498 static void
4499 pmap_flush_core_tlb_asid_async(pmap_t pmap)
4500 {
4501 flush_core_tlb_asid_async(((uint64_t) pmap->hw_asid) << TLBI_ASID_SHIFT);
4502 }
4503
4504 static inline bool
4505 pmap_user_ttb_is_clear(void)
4506 {
4507 return get_mmu_ttb() == (invalid_ttep & TTBR_BADDR_MASK);
4508 }
4509
4510 MARK_AS_PMAP_TEXT void
4511 pmap_switch_internal(
4512 pmap_t pmap)
4513 {
4514 pmap_cpu_data_t *cpu_data_ptr = pmap_get_cpu_data();
4515 #if XNU_MONITOR
4516 os_atomic_store(&cpu_data_ptr->active_pmap, pmap, relaxed);
4517
4518 /**
4519 * Make sure a pmap is never active-and-nested. For more details,
4520 * see pmap_set_nested_internal().
4521 */
4522 os_atomic_thread_fence(seq_cst);
4523 if (__improbable(os_atomic_load(&pmap->type, relaxed) == PMAP_TYPE_NESTED)) {
4524 panic("%s: attempt to activate nested pmap %p", __func__, pmap);
4525 }
4526 #endif
4527 validate_pmap_mutable(pmap);
4528 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4529 uint16_t asid_index = pmap->hw_asid;
4530 bool do_asid_flush = false;
4531 bool do_commpage_flush = false;
4532
4533 if (__improbable((asid_index == 0) && (pmap != kernel_pmap))) {
4534 panic("%s: attempt to activate pmap with invalid ASID %p", __func__, pmap);
4535 }
4536 #if __ARM_KERNEL_PROTECT__
4537 asid_index >>= 1;
4538 #endif
4539
4540 pmap_t last_nested_pmap = cpu_data_ptr->cpu_nested_pmap;
4541 __unused const pt_attr_t *last_nested_pmap_attr = cpu_data_ptr->cpu_nested_pmap_attr;
4542 __unused vm_map_address_t last_nested_region_addr = cpu_data_ptr->cpu_nested_region_addr;
4543 __unused vm_map_offset_t last_nested_region_size = cpu_data_ptr->cpu_nested_region_size;
4544 bool do_shared_region_flush = ((pmap != kernel_pmap) && (last_nested_pmap != NULL) && (pmap->nested_pmap != last_nested_pmap));
4545 bool break_before_make = do_shared_region_flush;
4546
4547 #if !HAS_16BIT_ASID
4548 if ((pmap_max_asids > MAX_HW_ASIDS) && (asid_index > 0)) {
4549 asid_index -= 1;
4550 pmap_update_plru(asid_index);
4551
4552 /* Paranoia. */
4553 assert(asid_index < (sizeof(cpu_data_ptr->cpu_sw_asids) / sizeof(*cpu_data_ptr->cpu_sw_asids)));
4554
4555 /* Extract the "virtual" bits of the ASIDs (which could cause us to alias). */
4556 uint8_t new_sw_asid = pmap->sw_asid;
4557 uint8_t last_sw_asid = cpu_data_ptr->cpu_sw_asids[asid_index];
4558
4559 if (new_sw_asid != last_sw_asid) {
4560 /*
4561 * If the virtual ASID of the new pmap does not match the virtual ASID
4562 * last seen on this CPU for the physical ASID (that was a mouthful),
4563 * then this switch runs the risk of aliasing. We need to flush the
			 * TLB for this physical ASID in this case.
4565 */
4566 cpu_data_ptr->cpu_sw_asids[asid_index] = new_sw_asid;
4567 do_asid_flush = true;
4568 break_before_make = true;
4569 }
4570 }
4571 #endif /* !HAS_16BIT_ASID */
4572
4573 #if __ARM_MIXED_PAGE_SIZE__
4574 if (pt_attr->pta_tcr_value != get_tcr()) {
4575 break_before_make = true;
4576 }
4577 #endif
4578 #if __ARM_MIXED_PAGE_SIZE__
4579 /*
4580 * For mixed page size configurations, we need to flush the global commpage mappings from
4581 * the TLB when transitioning between address spaces with different page sizes. Otherwise
4582 * it's possible for a TLB fill against the incoming commpage to produce a TLB entry which
4583 * which partially overlaps a TLB entry from the outgoing commpage, leading to a TLB
4584 * conflict abort or other unpredictable behavior.
4585 */
4586 if (pt_attr_leaf_shift(pt_attr) != cpu_data_ptr->commpage_page_shift) {
4587 do_commpage_flush = true;
4588 }
4589 if (do_commpage_flush) {
4590 break_before_make = true;
4591 }
4592 #endif
4593 if (__improbable(break_before_make && !pmap_user_ttb_is_clear())) {
4594 PMAP_TRACE(1, PMAP_CODE(PMAP__CLEAR_USER_TTB), VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
4595 pmap_clear_user_ttb_internal();
4596 }
4597
4598 /* If we're switching to a different nested pmap (i.e. shared region), we'll need
4599 * to flush the userspace mappings for that region. Those mappings are global
4600 * and will not be protected by the ASID. It should also be cheaper to flush the
4601 * entire local TLB rather than to do a broadcast MMU flush by VA region. */
4602 if (__improbable(do_shared_region_flush)) {
4603 #if __ARM_RANGE_TLBI__
4604 uint64_t page_shift_prev = pt_attr_leaf_shift(last_nested_pmap_attr);
4605 vm_map_offset_t npages_prev = last_nested_region_size >> page_shift_prev;
4606
4607 /* NOTE: here we flush the global TLB entries for the previous nested region only.
4608 * There may still be non-global entries that overlap with the incoming pmap's
4609 * nested region. On Apple SoCs at least, this is acceptable. Those non-global entries
4610 * must necessarily belong to a different ASID than the incoming pmap, or they would
4611 * be flushed in the do_asid_flush case below. This will prevent them from conflicting
4612 * with the incoming pmap's nested region. However, the ARMv8 ARM is not crystal clear
4613 * on whether such a global/inactive-nonglobal overlap is acceptable, so we may need
4614 * to consider additional invalidation here in the future. */
4615 if ((npages_prev >= ARM64_TLB_RANGE_MIN_PAGES) && (npages_prev <= ARM64_TLB_RANGE_MAX_PAGES)) {
4616 flush_core_tlb_allrange_async(generate_rtlbi_param((ppnum_t)npages_prev, 0, last_nested_region_addr, page_shift_prev));
4617 } else {
4618 /*
4619 * Note that we also do a full flush if npages_prev is 1 (i.e. at or below
4620 * ARM64_RANGE_TLB_FLUSH_THRESHOLD), but a properly configured system will never
4621 * have a single-page shared region anyway, not least because pmap_nest()
4622 * requires L2 block alignment of the address and size.
4623 */
4624 do_asid_flush = false;
4625 flush_core_tlb_async();
4626 }
4627 #else
4628 do_asid_flush = false;
4629 flush_core_tlb_async();
4630 #endif // __ARM_RANGE_TLBI__
4631 }
4632
4633 #if __ARM_MIXED_PAGE_SIZE__
4634 if (__improbable(do_commpage_flush)) {
4635 const uint64_t commpage_shift = cpu_data_ptr->commpage_page_shift;
4636 const uint64_t rtlbi_param = generate_rtlbi_param((ppnum_t)_COMM_PAGE64_NESTING_SIZE >> commpage_shift,
4637 0, _COMM_PAGE64_NESTING_START, commpage_shift);
4638 flush_core_tlb_allrange_async(rtlbi_param);
4639 }
4640 #endif
4641 if (__improbable(do_asid_flush)) {
4642 pmap_flush_core_tlb_asid_async(pmap);
4643 #if DEVELOPMENT || DEBUG
4644 os_atomic_inc(&pmap_asid_flushes, relaxed);
4645 #endif
4646 }
4647 if (__improbable(do_asid_flush || do_shared_region_flush || do_commpage_flush)) {
4648 sync_tlb_flush_local();
4649 }
4650
4651 pmap_switch_user_ttb(pmap, cpu_data_ptr);
4652 }
4653
/*
 * Switch the current CPU to the user address space of the given pmap.
 * Thin dispatch wrapper: on XNU_MONITOR (PPL) configurations the switch
 * must be carried out inside the PPL via pmap_switch_ppl(); otherwise
 * pmap_switch_internal() is called directly.  Emits begin/end tracepoints
 * around the switch.
 */
void
pmap_switch(
	pmap_t pmap)
{
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_START, VM_KERNEL_ADDRHIDE(pmap), PMAP_VASID(pmap), pmap->hw_asid);
#if XNU_MONITOR
	pmap_switch_ppl(pmap);
#else
	pmap_switch_internal(pmap);
#endif
	PMAP_TRACE(1, PMAP_CODE(PMAP__SWITCH) | DBG_FUNC_END);
}
4666
/*
 * Lower the permission of every mapping to physical page "ppnum" to at
 * most "prot".  Convenience wrapper around pmap_page_protect_options()
 * with no option flags and no opaque argument.
 */
void
pmap_page_protect(
	ppnum_t ppnum,
	vm_prot_t prot)
{
	pmap_page_protect_options(ppnum, prot, 0, NULL);
}
4674
4675 /*
4676 * Routine: pmap_page_protect_options
4677 *
4678 * Function:
4679 * Lower the permission for all mappings to a given
4680 * page.
4681 */
4682 MARK_AS_PMAP_TEXT static void
4683 pmap_page_protect_options_with_flush_range(
4684 ppnum_t ppnum,
4685 vm_prot_t prot,
4686 unsigned int options,
4687 pmap_tlb_flush_range_t *flush_range)
4688 {
4689 pmap_paddr_t phys = ptoa(ppnum);
4690 pv_entry_t **pv_h;
4691 pv_entry_t *pve_p, *orig_pve_p;
4692 pv_entry_t *pveh_p;
4693 pv_entry_t *pvet_p;
4694 pt_entry_t *pte_p, *orig_pte_p;
4695 pv_entry_t *new_pve_p;
4696 pt_entry_t *new_pte_p;
4697 vm_offset_t pvh_flags;
4698 unsigned int pai;
4699 bool remove;
4700 bool set_NX;
4701 unsigned int pvh_cnt = 0;
4702 unsigned int pass1_updated = 0;
4703 unsigned int pass2_updated = 0;
4704
4705 assert(ppnum != vm_page_fictitious_addr);
4706
4707 /* Only work with managed pages. */
4708 if (!pa_valid(phys)) {
4709 return;
4710 }
4711
4712 /*
4713 * Determine the new protection.
4714 */
4715 switch (prot) {
4716 case VM_PROT_ALL:
4717 return; /* nothing to do */
4718 case VM_PROT_READ:
4719 case VM_PROT_READ | VM_PROT_EXECUTE:
4720 remove = false;
4721 break;
4722 default:
4723 /* PPL security model requires that we flush TLBs before we exit if the page may be recycled. */
4724 options = options & ~PMAP_OPTIONS_NOFLUSH;
4725 remove = true;
4726 break;
4727 }
4728
4729 pmap_cpu_data_t *pmap_cpu_data = NULL;
4730 if (remove) {
4731 #if !XNU_MONITOR
4732 mp_disable_preemption();
4733 #endif
4734 pmap_cpu_data = pmap_get_cpu_data();
4735 os_atomic_store(&pmap_cpu_data->inflight_disconnect, true, relaxed);
4736 /*
4737 * Ensure the store to inflight_disconnect will be observed before any of the
4738 * ensuing PTE/refcount stores in this function. This flag is used to avoid
4739 * a race in which the VM may clear a pmap's mappings and destroy the pmap on
4740 * another CPU, in between this function's clearing a PTE and dropping the
4741 * corresponding pagetable refcount. That can lead to a panic if the
4742 * destroying thread observes a non-zero refcount. For this we need a store-
4743 * store barrier; a store-release operation would not be sufficient.
4744 */
4745 os_atomic_thread_fence(release);
4746 }
4747
4748 pai = pa_index(phys);
4749 pvh_lock(pai);
4750 pv_h = pai_to_pvh(pai);
4751 pvh_flags = pvh_get_flags(pv_h);
4752
4753 #if XNU_MONITOR
4754 if (__improbable(remove && (pvh_flags & PVH_FLAG_LOCKDOWN_MASK))) {
4755 panic("%d is locked down (%#llx), cannot remove", pai, (uint64_t)pvh_get_flags(pv_h));
4756 }
4757 if (__improbable(ppattr_pa_test_monitor(phys))) {
4758 panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
4759 }
4760 if (__improbable(remove && ppattr_pa_test_no_monitor(phys))) {
4761 panic("%s: PA 0x%llx is pinned.", __func__, (uint64_t)phys);
4762 }
4763 #endif
4764
4765
4766 orig_pte_p = pte_p = PT_ENTRY_NULL;
4767 orig_pve_p = pve_p = PV_ENTRY_NULL;
4768 pveh_p = PV_ENTRY_NULL;
4769 pvet_p = PV_ENTRY_NULL;
4770 new_pve_p = PV_ENTRY_NULL;
4771 new_pte_p = PT_ENTRY_NULL;
4772
4773
4774 if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
4775 orig_pte_p = pte_p = pvh_ptep(pv_h);
4776 } else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
4777 orig_pve_p = pve_p = pvh_pve_list(pv_h);
4778 pveh_p = pve_p;
4779 } else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
4780 panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
4781 }
4782
4783 /* Pass 1: Update all CPU PTEs and accounting info as necessary */
4784 int pve_ptep_idx = 0;
4785
4786 /*
4787 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
4788 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
4789 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
4790 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
4791 * operation, TLB invalidation may be handled by the caller so it's possible for
4792 * tlb_flush_needed to be true while issue_tlbi is false.
4793 */
4794 bool issue_tlbi = false;
4795 bool tlb_flush_needed = false;
4796 const bool compress = (options & PMAP_OPTIONS_COMPRESSOR);
4797 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
4798 pt_entry_t tmplate = ARM_PTE_TYPE_FAULT;
4799 bool update = false;
4800
4801 if (pve_p != PV_ENTRY_NULL) {
4802 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
4803 if (pte_p == PT_ENTRY_NULL) {
4804 goto protect_skip_pve_pass1;
4805 }
4806 }
4807
4808 #ifdef PVH_FLAG_IOMMU
4809 if (pvh_ptep_is_iommu(pte_p)) {
4810 #if XNU_MONITOR
4811 if (__improbable(pvh_flags & PVH_FLAG_LOCKDOWN_MASK)) {
4812 panic("pmap_page_protect: ppnum 0x%x locked down, cannot be owned by iommu %p, pve_p=%p",
4813 ppnum, ptep_get_iommu(pte_p), pve_p);
4814 }
4815 #endif
4816 if (remove && (options & PMAP_OPTIONS_COMPRESSOR)) {
4817 panic("pmap_page_protect: attempt to compress ppnum 0x%x owned by iommu %p, pve_p=%p",
4818 ppnum, ptep_get_iommu(pte_p), pve_p);
4819 }
4820 goto protect_skip_pve_pass1;
4821 }
4822 #endif
4823 const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
4824 const pmap_t pmap = ptdp->pmap;
4825 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
4826
4827 if (__improbable((pmap == NULL) || (atop(pte_to_pa(*pte_p)) != ppnum))) {
4828 #if MACH_ASSERT
4829 if ((pmap != NULL) && (pve_p != PV_ENTRY_NULL) && (kern_feature_override(KF_PMAPV_OVRD) == FALSE)) {
4830 /* Temporarily set PTEP to NULL so that the logic below doesn't pick it up as duplicate. */
4831 pt_entry_t *temp_ptep = pve_get_ptep(pve_p, pve_ptep_idx);
4832 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
4833
4834 pv_entry_t *check_pvep = pve_p;
4835
4836 do {
4837 if (pve_find_ptep_index(check_pvep, pte_p) != -1) {
4838 panic_plain("%s: duplicate pve entry ptep=%p pmap=%p, pvh=%p, "
4839 "pvep=%p, pai=0x%x", __func__, pte_p, pmap, pv_h, pve_p, pai);
4840 }
4841 } while ((check_pvep = pve_next(check_pvep)) != PV_ENTRY_NULL);
4842
4843 /* Restore previous PTEP value. */
4844 pve_set_ptep(pve_p, pve_ptep_idx, temp_ptep);
4845 }
4846 #endif
4847 panic("pmap_page_protect: bad pve entry pte_p=%p pmap=%p prot=%d options=%u, pv_h=%p, pveh_p=%p, pve_p=%p, pte=0x%llx, va=0x%llx ppnum: 0x%x",
4848 pte_p, pmap, prot, options, pv_h, pveh_p, pve_p, (uint64_t)*pte_p, (uint64_t)va, ppnum);
4849 }
4850
4851 #if DEVELOPMENT || DEBUG
4852 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
4853 #else
4854 if ((prot & VM_PROT_EXECUTE))
4855 #endif
4856 {
4857 set_NX = false;
4858 } else {
4859 set_NX = true;
4860 }
4861
4862 #if HAS_FEAT_XS
4863 /**
4864 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
4865 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
4866 */
4867 assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
4868 #endif /* HAS_FEAT_XS */
4869
4870 /* Remove the mapping if new protection is NONE */
4871 if (remove) {
4872 if (__improbable(pmap->type == PMAP_TYPE_COMMPAGE)) {
4873 panic("%s: trying to remove mappings from a COMMPAGE pmap %p when unmapping ppnum %u.",
4874 __func__, pmap, ppnum);
4875 }
4876
4877 const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
4878 const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
4879 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
4880 pt_entry_t spte = *pte_p;
4881
4882 if (pte_is_wired(spte)) {
4883 pte_set_wired(pmap, pte_p, 0);
4884 spte = *pte_p;
4885 if (pmap != kernel_pmap) {
4886 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4887 }
4888 }
4889
4890 assertf(atop(pte_to_pa(spte)) == ppnum, "unexpected value 0x%llx for pte %p mapping ppnum 0x%x",
4891 (uint64_t)spte, pte_p, ppnum);
4892
4893 if (compress && is_internal && (pmap != kernel_pmap)) {
4894 assert(!ARM_PTE_IS_COMPRESSED(*pte_p, pte_p));
4895 /* mark this PTE as having been "compressed" */
4896 tmplate = ARM_PTE_COMPRESSED;
4897 if (is_altacct) {
4898 tmplate |= ARM_PTE_COMPRESSED_ALT;
4899 }
4900 } else {
4901 tmplate = ARM_PTE_TYPE_FAULT;
4902 }
4903
4904 assert(spte != tmplate);
4905 write_pte_fast(pte_p, tmplate);
4906 update = true;
4907 ++pass1_updated;
4908
4909 pmap_ledger_debit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4910
4911 if (pmap != kernel_pmap) {
4912 if (ppattr_test_reusable(pai) &&
4913 is_internal &&
4914 !is_altacct) {
4915 pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4916 } else if (!is_internal) {
4917 pmap_ledger_debit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4918 }
4919
4920 if (is_altacct) {
4921 assert(is_internal);
4922 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4923 pmap_ledger_debit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4924 if (options & PMAP_OPTIONS_COMPRESSOR) {
4925 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4926 pmap_ledger_credit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4927 }
4928 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4929 ppattr_pve_clr_altacct(pai, pve_p, pve_ptep_idx);
4930 } else if (ppattr_test_reusable(pai)) {
4931 assert(is_internal);
4932 if (options & PMAP_OPTIONS_COMPRESSOR) {
4933 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4934 /* was not in footprint, but is now */
4935 pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4936 }
4937 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4938 } else if (is_internal) {
4939 pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4940
4941 /*
4942 * Update all stats related to physical footprint, which only
4943 * deals with internal pages.
4944 */
4945 if (options & PMAP_OPTIONS_COMPRESSOR) {
4946 /*
4947 * This removal is only being done so we can send this page to
4948 * the compressor; therefore it mustn't affect total task footprint.
4949 */
4950 pmap_ledger_credit(pmap, task_ledgers.internal_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4951 } else {
4952 /*
4953 * This internal page isn't going to the compressor, so adjust stats to keep
4954 * phys_footprint up to date.
4955 */
4956 pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
4957 }
4958 ppattr_pve_clr_internal(pai, pve_p, pve_ptep_idx);
4959 } else {
4960 /* external page: no impact on ledgers */
4961 }
4962 }
4963 assert((pve_p == PV_ENTRY_NULL) || !pve_get_altacct(pve_p, pve_ptep_idx));
4964 } else {
4965 pt_entry_t spte = *pte_p;
4966 const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);
4967
4968 if (pmap == kernel_pmap) {
4969 tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
4970 } else {
4971 tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
4972 }
4973
4974 /*
4975 * While the naive implementation of this would serve to add execute
4976 * permission, this is not how the VM uses this interface, or how
4977 * x86_64 implements it. So ignore requests to add execute permissions.
4978 */
4979 if (set_NX) {
4980 tmplate |= pt_attr_leaf_xn(pt_attr);
4981 }
4982
4983
4984 assert(spte != ARM_PTE_TYPE_FAULT);
4985 assert(!ARM_PTE_IS_COMPRESSED(spte, pte_p));
4986
4987 if (spte != tmplate) {
4988 /*
4989 * Mark the PTE so that we'll know this mapping requires a TLB flush in pass 2.
4990 * This allows us to avoid unnecessary flushing e.g. for COW aliases that didn't
4991 * require permission updates. We use the ARM_PTE_WRITEABLE bit as that bit
4992 * should always be cleared by this function.
4993 */
4994 pte_set_was_writeable(tmplate, true);
4995 write_pte_fast(pte_p, tmplate);
4996 update = true;
4997 ++pass1_updated;
4998 } else if (pte_was_writeable(tmplate)) {
4999 /*
5000 * We didn't change any of the relevant permission bits in the PTE, so we don't need
5001 * to flush the TLB, but we do want to clear the "was_writeable" flag. When revoking
5002 * write access to a page, this function should always at least clear that flag for
5003 * all PTEs, as the VM is effectively requesting that subsequent write accesses to
5004 * these mappings go through vm_fault(). We therefore don't want those accesses to
5005 * be handled through arm_fast_fault().
5006 */
5007 pte_set_was_writeable(tmplate, false);
5008 write_pte_fast(pte_p, tmplate);
5009 }
5010 }
5011
5012 if (!issue_tlbi && update && !(options & PMAP_OPTIONS_NOFLUSH)) {
5013 tlb_flush_needed = true;
5014 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5015 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5016 issue_tlbi = true;
5017 }
5018 }
5019 protect_skip_pve_pass1:
5020 pte_p = PT_ENTRY_NULL;
5021 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5022 pve_ptep_idx = 0;
5023 pve_p = pve_next(pve_p);
5024 }
5025 }
5026
5027 if (tlb_flush_needed) {
5028 FLUSH_PTE_STRONG();
5029 }
5030
5031 if (!remove && !issue_tlbi) {
5032 goto protect_finish;
5033 }
5034
5035 /* Pass 2: Invalidate TLBs and update the list to remove CPU mappings */
5036 pv_entry_t **pve_pp = pv_h;
5037 pve_p = orig_pve_p;
5038 pte_p = orig_pte_p;
5039 pve_ptep_idx = 0;
5040
5041 /*
5042 * We need to keep track of whether a particular PVE list contains IOMMU
5043 * mappings when removing entries, because we should only remove CPU
5044 * mappings. If a PVE list contains at least one IOMMU mapping, we keep
5045 * it around.
5046 */
5047 bool iommu_mapping_in_pve = false;
5048 while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
5049 if (pve_p != PV_ENTRY_NULL) {
5050 pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
5051 if (pte_p == PT_ENTRY_NULL) {
5052 goto protect_skip_pve_pass2;
5053 }
5054 }
5055
5056 #ifdef PVH_FLAG_IOMMU
5057 if (pvh_ptep_is_iommu(pte_p)) {
5058 iommu_mapping_in_pve = true;
5059 if (remove && (pve_p == PV_ENTRY_NULL)) {
5060 /*
5061 * We've found an IOMMU entry and it's the only entry in the PV list.
5062 * We don't discard IOMMU entries, so simply set up the new PV list to
5063 * contain the single IOMMU PTE and exit the loop.
5064 */
5065 new_pte_p = pte_p;
5066 break;
5067 }
5068 goto protect_skip_pve_pass2;
5069 }
5070 #endif
5071 pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
5072 const pmap_t pmap = ptdp->pmap;
5073 const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
5074
5075 if (remove) {
5076 if (!compress && (pmap != kernel_pmap)) {
5077 /*
5078 * We must wait to decrement the refcount until we're completely finished using the PTE
5079 * on this path. Otherwise, if we happened to drop the refcount to zero, a concurrent
5080 * pmap_remove() call might observe the zero refcount and free the pagetable out from
5081 * under us.
5082 */
5083 if (OSAddAtomic16(-1, (SInt16 *) &(ptd_get_info(ptdp, pte_p)->refcnt)) <= 0) {
5084 panic("pmap_page_protect_options(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
5085 }
5086 }
5087 /* Remove this CPU mapping from PVE list. */
5088 if (pve_p != PV_ENTRY_NULL) {
5089 pve_set_ptep(pve_p, pve_ptep_idx, PT_ENTRY_NULL);
5090 }
5091 } else {
5092 pt_entry_t spte = *pte_p;
5093 if (pte_was_writeable(spte)) {
5094 pte_set_was_writeable(spte, false);
5095 write_pte_fast(pte_p, spte);
5096 } else {
5097 goto protect_skip_pve_pass2;
5098 }
5099 }
5100 ++pass2_updated;
5101 if (remove || !flush_range || (flush_range->ptfr_pmap != pmap) ||
5102 (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
5103 pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
5104 pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
5105 }
5106
5107 protect_skip_pve_pass2:
5108 pte_p = PT_ENTRY_NULL;
5109 if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
5110 pve_ptep_idx = 0;
5111
5112 if (remove) {
5113 /**
5114 * If there are any IOMMU mappings in the PVE list, preserve
5115 * those mappings in a new PVE list (new_pve_p) which will later
5116 * become the new PVH entry. Keep track of the CPU mappings in
5117 * pveh_p/pvet_p so they can be deallocated later.
5118 */
5119 if (iommu_mapping_in_pve) {
5120 iommu_mapping_in_pve = false;
5121 pv_entry_t *temp_pve_p = pve_next(pve_p);
5122 pve_remove(pv_h, pve_pp, pve_p);
5123 pveh_p = pvh_pve_list(pv_h);
5124 pve_p->pve_next = new_pve_p;
5125 new_pve_p = pve_p;
5126 pve_p = temp_pve_p;
5127 continue;
5128 } else {
5129 pvet_p = pve_p;
5130 pvh_cnt++;
5131 }
5132 }
5133
5134 pve_pp = pve_next_ptr(pve_p);
5135 pve_p = pve_next(pve_p);
5136 iommu_mapping_in_pve = false;
5137 }
5138 }
5139
5140 protect_finish:
5141
5142 #ifdef PVH_FLAG_EXEC
5143 if (remove && (pvh_get_flags(pv_h) & PVH_FLAG_EXEC)) {
5144 pmap_set_ptov_ap(pai, AP_RWNA, tlb_flush_needed);
5145 }
5146 #endif
5147 if (__improbable(pass1_updated != pass2_updated)) {
5148 panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
5149 __func__, pass1_updated, pass2_updated);
5150 }
5151 /* if we removed a bunch of entries, take care of them now */
5152 if (remove) {
5153 if (new_pve_p != PV_ENTRY_NULL) {
5154 pvh_update_head(pv_h, new_pve_p, PVH_TYPE_PVEP);
5155 pvh_set_flags(pv_h, pvh_flags);
5156 } else if (new_pte_p != PT_ENTRY_NULL) {
5157 pvh_update_head(pv_h, new_pte_p, PVH_TYPE_PTEP);
5158 pvh_set_flags(pv_h, pvh_flags);
5159 } else {
5160 if (__improbable(pvh_flags & PVH_FLAG_FLUSH_NEEDED)) {
5161 pmap_flush_noncoherent_page(phys);
5162 }
5163 pvh_update_head(pv_h, PV_ENTRY_NULL, PVH_TYPE_NULL);
5164 }
5165 }
5166
5167 if (flush_range && tlb_flush_needed) {
5168 if (!remove) {
5169 flush_range->ptfr_flush_needed = true;
5170 tlb_flush_needed = false;
5171 }
5172 }
5173
5174 /*
5175 * If we removed PV entries, ensure prior TLB flushes are complete before we drop the PVH
5176 * lock to allow the backing pages to be repurposed. This is a security precaution, aimed
5177 * primarily at XNU_MONITOR configurations, to reduce the likelihood of an attacker causing
5178 * a page to be repurposed while it is still live in the TLBs.
5179 */
5180 if (remove && tlb_flush_needed) {
5181 sync_tlb_flush();
5182 }
5183
5184
5185 pvh_unlock(pai);
5186
5187 if (remove) {
5188 os_atomic_store(&pmap_cpu_data->inflight_disconnect, false, release);
5189 #if !XNU_MONITOR
5190 mp_enable_preemption();
5191 #endif
5192 }
5193
5194 if (!remove && tlb_flush_needed) {
5195 sync_tlb_flush();
5196 }
5197
5198 if (remove && (pvet_p != PV_ENTRY_NULL)) {
5199 pv_list_free(pveh_p, pvet_p, pvh_cnt);
5200 }
5201 }
5202
5203 MARK_AS_PMAP_TEXT void
5204 pmap_page_protect_options_internal(
5205 ppnum_t ppnum,
5206 vm_prot_t prot,
5207 unsigned int options,
5208 void *arg)
5209 {
5210 if (arg != NULL) {
5211 /*
5212 * If the argument is non-NULL, the VM layer is conveying its intention that the TLBs should
5213 * ultimately be flushed. The nature of ARM TLB maintenance is such that we can flush the
5214 * TLBs much more precisely if we do so inline with the pagetable updates, and PPL security
5215 * model requires that we not exit the PPL without performing required TLB flushes anyway.
5216 * In that case, force the flush to take place.
5217 */
5218 options &= ~PMAP_OPTIONS_NOFLUSH;
5219 }
5220 pmap_page_protect_options_with_flush_range(ppnum, prot, options, NULL);
5221 }
5222
5223 void
5224 pmap_page_protect_options(
5225 ppnum_t ppnum,
5226 vm_prot_t prot,
5227 unsigned int options,
5228 void *arg)
5229 {
5230 pmap_paddr_t phys = ptoa(ppnum);
5231
5232 assert(ppnum != vm_page_fictitious_addr);
5233
5234 /* Only work with managed pages. */
5235 if (!pa_valid(phys)) {
5236 return;
5237 }
5238
5239 /*
5240 * Determine the new protection.
5241 */
5242 if (prot == VM_PROT_ALL) {
5243 return; /* nothing to do */
5244 }
5245
5246 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_START, ppnum, prot);
5247
5248 #if XNU_MONITOR
5249 pmap_page_protect_options_ppl(ppnum, prot, options, arg);
5250 #else
5251 pmap_page_protect_options_internal(ppnum, prot, options, arg);
5252 #endif
5253
5254 PMAP_TRACE(2, PMAP_CODE(PMAP__PAGE_PROTECT) | DBG_FUNC_END);
5255 }
5256
5257
5258 #if __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG))
/*
 * In-kernel (or in-PPL) body of pmap_disable_user_jop(): mark the given
 * user pmap as having JOP/pointer-auth disabled.  Panics if called on the
 * kernel pmap.
 */
MARK_AS_PMAP_TEXT void
pmap_disable_user_jop_internal(pmap_t pmap)
{
	/* Disabling JOP only makes sense for user address spaces. */
	if (pmap == kernel_pmap) {
		panic("%s: called with kernel_pmap", __func__);
	}
	/* Validate the pmap input before mutating its data. */
	validate_pmap_mutable(pmap);
	pmap->disable_jop = true;
}
5268
/*
 * Disable JOP (arm64e pointer authentication) for the given user pmap.
 * Dispatches into the PPL on XNU_MONITOR configurations, otherwise calls
 * the internal implementation directly.
 */
void
pmap_disable_user_jop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_disable_user_jop_ppl(pmap);
#else
	pmap_disable_user_jop_internal(pmap);
#endif
}
5278 #endif /* __has_feature(ptrauth_calls) && (defined(XNU_TARGET_OS_OSX) || (DEVELOPMENT || DEBUG)) */
5279
5280 /*
5281 * Indicates if the pmap layer enforces some additional restrictions on the
5282 * given set of protections.
5283 */
5284 bool
5285 pmap_has_prot_policy(__unused pmap_t pmap, __unused bool translated_allow_execute, __unused vm_prot_t prot)
5286 {
5287 return false;
5288 }
5289
5290 /*
5291 * Set the physical protection on the
5292 * specified range of this map as requested.
5293 * VERY IMPORTANT: Will not increase permissions.
5294 * VERY IMPORTANT: Only pmap_enter() is allowed to grant permissions.
5295 */
5296 void
5297 pmap_protect(
5298 pmap_t pmap,
5299 vm_map_address_t b,
5300 vm_map_address_t e,
5301 vm_prot_t prot)
5302 {
5303 pmap_protect_options(pmap, b, e, prot, 0, NULL);
5304 }
5305
/*
 * Internal body of pmap_protect_options(): downgrade the protection of the
 * mappings in [start, end) of "pmap".  The range must lie within a single
 * twig (pagetable leaf block) — callers split larger ranges — and the panic
 * below enforces this.  Returns the VA at which the walk stopped; this may
 * be before "end" if pending preemption forced an early exit, in which case
 * the caller re-invokes from the returned address.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
pmap_protect_options_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	tt_entry_t *tte_p;
	pt_entry_t *bpte_p, *epte_p;
	pt_entry_t *pte_p;
	boolean_t set_NX = TRUE;
	boolean_t set_XO = FALSE;
	boolean_t should_have_removed = FALSE;
	bool need_strong_sync = false;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t *const pt_attr = pmap_get_pt_attr(pmap);

	/* The range must not cross the twig boundary containing "start". */
	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			should_have_removed = TRUE;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
			set_XO = TRUE;
			OS_FALLTHROUGH;
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return end;         /* nothing to do */
		default:
			should_have_removed = TRUE;
		}
	}

	/* Removal requests must go through pmap_remove_options(); see the caller. */
	if (should_have_removed) {
		panic("%s: should have been a remove operation, "
		    "pmap=%p, start=%p, end=%p, prot=%#x, options=%#x, args=%p",
		    __FUNCTION__,
		    pmap, (void *)start, (void *)end, prot, options, args);
	}

#if DEVELOPMENT || DEBUG
	if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
#else
	if ((prot & VM_PROT_EXECUTE))
#endif
	{
		set_NX = FALSE;
	} else {
		set_NX = TRUE;
	}

	const uint64_t pmap_page_size = PAGE_RATIO * pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	unsigned int npages = 0;

	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	tte_p = pmap_tte(pmap, start);

	if ((tte_p != (tt_entry_t *) NULL) && (*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		bpte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte_p = &bpte_p[pte_index(pt_attr, start)];
		epte_p = bpte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		pte_p = bpte_p;

		for (pte_p = bpte_p;
		    pte_p < epte_p;
		    pte_p += PAGE_RATIO, va += pmap_page_size) {
			++npages;
			/* Stop early under pending preemption; caller resumes at the returned VA. */
			if (__improbable(!(npages % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
			    pmap_pending_preemption())) {
				break;
			}
			pt_entry_t spte;
#if DEVELOPMENT || DEBUG
			boolean_t force_write = FALSE;
#endif

			spte = *((volatile pt_entry_t*)pte_p);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pmap_paddr_t pa;
			unsigned int pai = 0;
			boolean_t managed = FALSE;

			/*
			 * Take the PVH lock for the page the PTE currently maps, retrying
			 * until the PTE is observed stable under that lock (it may be
			 * concurrently retargeted before we acquire the lock).
			 */
			while (!managed) {
				/*
				 * It may be possible for the pte to transition from managed
				 * to unmanaged in this timeframe; for now, elide the assert.
				 * We should break out as a consequence of checking pa_valid.
				 */
				// assert(!ARM_PTE_IS_COMPRESSED(spte));
				pa = pte_to_pa(spte);
				if (!pa_valid(pa)) {
					break;
				}
				pai = pa_index(pa);
				pvh_lock(pai);
				spte = *((volatile pt_entry_t*)pte_p);
				pa = pte_to_pa(spte);
				if (pai == pa_index(pa)) {
					managed = TRUE;
					break; // Leave the PVH locked as we will unlock it after we free the PTE
				}
				pvh_unlock(pai);
			}

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				continue;
			}

			pt_entry_t tmplate;

			if (pmap == kernel_pmap) {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
				}
			} else {
#if DEVELOPMENT || DEBUG
				if ((options & PMAP_OPTIONS_PROTECT_IMMEDIATE) && (prot & VM_PROT_WRITE)) {
					assert(pmap->type != PMAP_TYPE_NESTED);
					force_write = TRUE;
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pt_attr));
				} else
#endif
				{
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
				}
			}

			/*
			 * XXX Removing "NX" would
			 * grant "execute" access
			 * immediately, bypassing any
			 * checks VM might want to do
			 * in its soft fault path.
			 * pmap_protect() and co. are
			 * not allowed to increase
			 * access permissions.
			 */
			if (set_NX) {
				tmplate |= pt_attr_leaf_xn(pt_attr);
			} else {
				if (pmap == kernel_pmap) {
					/* do NOT clear "PNX"! */
					tmplate |= ARM_PTE_NX;
				} else {
					/* do NOT clear "NX"! */
					tmplate |= pt_attr_leaf_x(pt_attr);
					if (set_XO) {
						tmplate &= ~ARM_PTE_APMASK;
						tmplate |= pt_attr_leaf_rona(pt_attr);
					}
				}
			}

#if DEVELOPMENT || DEBUG
			if (force_write) {
				/*
				 * TODO: Run CS/Monitor checks here.
				 */
				if (managed) {
					/*
					 * We are marking the page as writable,
					 * so we consider it to be modified and
					 * referenced.
					 */
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}

					if (ppattr_test_modfault(pai)) {
						ppattr_clear_modfault(pai);
					}
				}
			} else if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
				/*
				 * An immediate request for anything other than
				 * write should still mark the page as
				 * referenced if managed.
				 */
				if (managed) {
					ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
					tmplate |= ARM_PTE_AF;

					if (ppattr_test_reffault(pai)) {
						ppattr_clear_reffault(pai);
					}
				}
			}
#endif

			/* We do not expect to write fast fault the entry. */
			pte_set_was_writeable(tmplate, false);
#if HAS_FEAT_XS
			if (pte_is_xs(pt_attr, spte)) {
				need_strong_sync = true;
			}
#endif /* HAS_FEAT_XS */

			write_pte_fast(pte_p, tmplate);

			if (managed) {
				pvh_assert_locked(pai);
				pvh_unlock(pai);
			}
		}
		/* Synchronize the PTE stores, then invalidate the processed VA range. */
		FLUSH_PTE_STRONG();
		PMAP_UPDATE_TLBS(pmap, start, va, need_strong_sync, true);
	} else {
		/* No leaf table for this range: nothing mapped, report full progress. */
		va = end;
	}

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
	return va;
}
5553
/*
 * Reduce the access permissions of all mappings in the virtual range
 * [b, e) of the given pmap to be at most 'prot'.
 *
 * pmap_protect() is only allowed to reduce permissions: a request that
 * includes write access is a no-op, and a request for no access (or an
 * unsupported combination) removes the mappings outright.  The range is
 * processed one twig-table chunk at a time so that each internal call
 * stays within a single leaf page table.
 */
void
pmap_protect_options(
	pmap_t pmap,
	vm_map_address_t b,
	vm_map_address_t e,
	vm_prot_t prot,
	unsigned int options,
	__unused void *args)
{
	vm_map_address_t l, beg;

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Both endpoints must be aligned to the pmap's leaf page size. */
	if ((b | e) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_protect_options() pmap %p start 0x%llx end 0x%llx",
		    pmap, (uint64_t)b, (uint64_t)e);
	}

	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((e - b) > (pt_attr_page_size(pt_attr) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

#if DEVELOPMENT || DEBUG
	if (options & PMAP_OPTIONS_PROTECT_IMMEDIATE) {
		if ((prot & VM_PROT_ALL) == VM_PROT_NONE) {
			/* Immediate revocation of all access: just remove the mappings. */
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	} else
#endif
	{
		/* Determine the new protection. */
		switch (prot) {
		case VM_PROT_EXECUTE:
		case VM_PROT_READ:
		case VM_PROT_READ | VM_PROT_EXECUTE:
			break;
		case VM_PROT_READ | VM_PROT_WRITE:
		case VM_PROT_ALL:
			return; /* nothing to do */
		default:
			pmap_remove_options(pmap, b, e, options);
			return;
		}
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(b),
	    VM_KERNEL_ADDRHIDE(e));

	beg = b;

	/*
	 * Walk the range one twig-table boundary at a time; each iteration
	 * handles at most one leaf table's worth of mappings.
	 */
	while (beg < e) {
		l = ((beg + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > e) {
			l = e;
		}

#if XNU_MONITOR
		beg = pmap_protect_options_ppl(pmap, beg, l, prot, options, args);
#else
		beg = pmap_protect_options_internal(pmap, beg, l, prot, options, args);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__PROTECT) | DBG_FUNC_END);
}
5628
5629 /**
5630 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5631 *
5632 * @param pmap pmap to insert the pages into.
5633 * @param va virtual address to map the pages into.
5634 * @param pa page number of the first physical page to map.
5635 * @param size block size, in number of pages.
5636 * @param prot mapping protection attributes.
5637 * @param attr flags to pass to pmap_enter().
5638 *
5639 * @return KERN_SUCCESS.
5640 */
5641 kern_return_t
5642 pmap_map_block(
5643 pmap_t pmap,
5644 addr64_t va,
5645 ppnum_t pa,
5646 uint32_t size,
5647 vm_prot_t prot,
5648 int attr,
5649 unsigned int flags)
5650 {
5651 return pmap_map_block_addr(pmap, va, ((pmap_paddr_t)pa) << PAGE_SHIFT, size, prot, attr, flags);
5652 }
5653
5654 /**
5655 * Inserts an arbitrary number of physical pages ("block") in a pmap.
5656 * As opposed to pmap_map_block(), this function takes
5657 * a physical address as an input and operates using the
5658 * page size associated with the input pmap.
5659 *
5660 * @param pmap pmap to insert the pages into.
5661 * @param va virtual address to map the pages into.
5662 * @param pa physical address of the first physical page to map.
5663 * @param size block size, in number of pages.
5664 * @param prot mapping protection attributes.
5665 * @param attr flags to pass to pmap_enter().
5666 *
5667 * @return KERN_SUCCESS.
5668 */
5669 kern_return_t
5670 pmap_map_block_addr(
5671 pmap_t pmap,
5672 addr64_t va,
5673 pmap_paddr_t pa,
5674 uint32_t size,
5675 vm_prot_t prot,
5676 int attr,
5677 unsigned int flags)
5678 {
5679 #if __ARM_MIXED_PAGE_SIZE__
5680 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5681 const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
5682 #else
5683 const uint64_t pmap_page_size = PAGE_SIZE;
5684 #endif
5685
5686 for (ppnum_t page = 0; page < size; page++) {
5687 if (pmap_enter_addr(pmap, va, pa, prot, VM_PROT_NONE, attr, TRUE) != KERN_SUCCESS) {
5688 panic("%s: failed pmap_enter_addr, "
5689 "pmap=%p, va=%#llx, pa=%llu, size=%u, prot=%#x, flags=%#x",
5690 __FUNCTION__,
5691 pmap, va, (uint64_t)pa, size, prot, flags);
5692 }
5693
5694 va += pmap_page_size;
5695 pa += pmap_page_size;
5696 }
5697
5698 return KERN_SUCCESS;
5699 }
5700
5701 kern_return_t
5702 pmap_enter_addr(
5703 pmap_t pmap,
5704 vm_map_address_t v,
5705 pmap_paddr_t pa,
5706 vm_prot_t prot,
5707 vm_prot_t fault_type,
5708 unsigned int flags,
5709 boolean_t wired)
5710 {
5711 return pmap_enter_options_addr(pmap, v, pa, prot, fault_type, flags, wired, 0, NULL, PMAP_MAPPING_TYPE_INFER);
5712 }
5713
5714 /*
5715 * Insert the given physical page (p) at
5716 * the specified virtual address (v) in the
5717 * target physical map with the protection requested.
5718 *
5719 * If specified, the page will be wired down, meaning
5720 * that the related pte can not be reclaimed.
5721 *
5722 * NB: This is the only routine which MAY NOT lazy-evaluate
5723 * or lose information. That is, this routine must actually
5724 * insert this page into the given map eventually (must make
5725 * forward progress eventually.
5726 */
5727 kern_return_t
5728 pmap_enter(
5729 pmap_t pmap,
5730 vm_map_address_t v,
5731 ppnum_t pn,
5732 vm_prot_t prot,
5733 vm_prot_t fault_type,
5734 unsigned int flags,
5735 boolean_t wired,
5736 __unused pmap_mapping_type_t mapping_type)
5737 {
5738 return pmap_enter_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired);
5739 }
5740
5741 /*
5742 * Attempt to commit the pte.
5743 * Succeeds iff able to change *pte_p from old_pte to new_pte.
5744 * Performs no page table or accounting writes on failures.
5745 */
5746 static inline bool
5747 pmap_enter_pte(pmap_t pmap, pt_entry_t *pte_p, pt_entry_t *old_pte, pt_entry_t new_pte, vm_map_address_t v)
5748 {
5749 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
5750 bool success = false, changed_wiring = false;
5751
5752 __unreachable_ok_push
5753 if (TEST_PAGE_RATIO_4) {
5754 /*
5755 * 16K virtual pages w/ 4K hw pages.
5756 * We actually need to update 4 ptes here which can't easily be done atomically.
5757 * As a result we require the exclusive pmap lock.
5758 */
5759 pmap_assert_locked(pmap, PMAP_LOCK_EXCLUSIVE);
5760 *old_pte = *pte_p;
5761 if (*old_pte == new_pte) {
5762 /* Another thread completed this operation. Nothing to do here. */
5763 success = true;
5764 } else if (pa_valid(pte_to_pa(new_pte)) && pte_to_pa(*old_pte) != pte_to_pa(new_pte) &&
5765 (*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5766 /* pte has been modified by another thread and we hold the wrong PVH lock. Retry. */
5767 success = false;
5768 } else {
5769 write_pte_fast(pte_p, new_pte);
5770 success = true;
5771 }
5772 } else {
5773 success = os_atomic_cmpxchgv(pte_p, *old_pte, new_pte, old_pte, acq_rel);
5774 }
5775 __unreachable_ok_pop
5776
5777 if (success && *old_pte != new_pte) {
5778 if ((*old_pte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) {
5779 bool need_strong_sync = false;
5780 FLUSH_PTE_STRONG();
5781 #if HAS_FEAT_XS
5782 if (pte_is_xs(pt_attr, *old_pte)) {
5783 need_strong_sync = true;
5784 }
5785 #endif /* HAS_FEAT_XS */
5786 PMAP_UPDATE_TLBS(pmap, v, v + (pt_attr_page_size(pt_attr) * PAGE_RATIO), need_strong_sync, true);
5787 } else {
5788 FLUSH_PTE();
5789 __builtin_arm_isb(ISB_SY);
5790 }
5791 changed_wiring = ARM_PTE_IS_COMPRESSED(*old_pte, pte_p) ?
5792 (new_pte & ARM_PTE_WIRED) != 0 :
5793 (new_pte & ARM_PTE_WIRED) != (*old_pte & ARM_PTE_WIRED);
5794
5795 if (pmap != kernel_pmap && changed_wiring) {
5796 SInt16 *ptd_wiredcnt_ptr = (SInt16 *)&(ptep_get_info(pte_p)->wiredcnt);
5797 if (new_pte & ARM_PTE_WIRED) {
5798 OSAddAtomic16(1, ptd_wiredcnt_ptr);
5799 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5800 } else {
5801 OSAddAtomic16(-1, ptd_wiredcnt_ptr);
5802 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
5803 }
5804 }
5805
5806 PMAP_TRACE(4 + pt_attr_leaf_level(pt_attr), PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap),
5807 VM_KERNEL_ADDRHIDE(v), VM_KERNEL_ADDRHIDE(v + (pt_attr_page_size(pt_attr) * PAGE_RATIO)), new_pte);
5808 }
5809 return success;
5810 }
5811
/*
 * Translate VM_WIMG_* cacheability/ordering flags into the corresponding
 * PTE attribute-index and shareability bits.
 *
 * Device/posted memory types are additionally marked non-executable at
 * both EL0 and EL1 (ARM_PTE_NX | ARM_PTE_PNX).
 *
 * @param wimg	VM_WIMG_* flags; only the VM_WIMG_MASK bits are examined.
 * @param pa	physical address of the target page, used to select a
 *		weaker (reorderable) device type when it lies in DRAM.
 *
 * @return PTE attribute bits only (no address or access-permission bits).
 */
MARK_AS_PMAP_TEXT static pt_entry_t
wimg_to_pte(unsigned int wimg, __unused pmap_paddr_t pa)
{
	pt_entry_t pte;

	switch (wimg & (VM_WIMG_MASK)) {
	case VM_WIMG_IO:
		// Map DRAM addresses with VM_WIMG_IO as Device-GRE instead of
		// Device-nGnRnE. On H14+, accesses to them can be reordered by
		// AP, while preserving the security benefits of using device
		// mapping against side-channel attacks. On pre-H14 platforms,
		// the accesses will still be strongly ordered.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DISABLE);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_RT:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_RT);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED:
		// As with VM_WIMG_IO, DRAM gets the reorderable posted type.
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_REORDERED:
		if (is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
		} else {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_REORDERED);
		}
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_POSTED_COMBINED_REORDERED:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED);
#if HAS_FEAT_XS
		// Non-DRAM addresses use the XS variant of the attribute index.
		if (!is_dram_addr(pa)) {
			pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_POSTED_COMBINED_REORDERED_XS);
		}
#endif /* HAS_FEAT_XS */
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WCOMB:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITECOMB);
		pte |= ARM_PTE_NX | ARM_PTE_PNX;
		break;
	case VM_WIMG_WTHRU:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITETHRU);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_COPYBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
		break;
	case VM_WIMG_INNERWBACK:
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_INNERWRITEBACK);
		pte |= ARM_PTE_SH(SH_INNER_MEMORY);
		break;
	default:
		/* Unknown WIMG type: fall back to the default (cacheable) attribute. */
		pte = ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
		pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	}

	return pte;
}
5883
5884
5885 /*
5886 * Construct a PTE (and the physical page attributes) for the given virtual to
5887 * physical mapping.
5888 *
5889 * This function has no side effects and is safe to call so that it is safe to
5890 * call while attempting a pmap_enter transaction.
5891 */
5892 MARK_AS_PMAP_TEXT static pt_entry_t
5893 pmap_construct_pte(
5894 const pmap_t pmap,
5895 vm_map_address_t va,
5896 pmap_paddr_t pa,
5897 vm_prot_t prot,
5898 vm_prot_t fault_type,
5899 boolean_t wired,
5900 const pt_attr_t* const pt_attr,
5901 uint16_t *pp_attr_bits /* OUTPUT */
5902 )
5903 {
5904 bool set_NX = false, set_XO = false;
5905 pt_entry_t pte = pa_to_pte(pa) | ARM_PTE_TYPE;
5906 assert(pp_attr_bits != NULL);
5907 *pp_attr_bits = 0;
5908
5909 if (wired) {
5910 pte |= ARM_PTE_WIRED;
5911 }
5912
5913 #if DEVELOPMENT || DEBUG
5914 if ((prot & VM_PROT_EXECUTE) || !nx_enabled || !pmap->nx_enabled)
5915 #else
5916 if ((prot & VM_PROT_EXECUTE))
5917 #endif
5918 {
5919 set_NX = false;
5920 } else {
5921 set_NX = true;
5922 }
5923
5924 if (prot == VM_PROT_EXECUTE) {
5925 set_XO = true;
5926 }
5927
5928 if (set_NX) {
5929 pte |= pt_attr_leaf_xn(pt_attr);
5930 } else {
5931 if (pmap == kernel_pmap) {
5932 pte |= ARM_PTE_NX;
5933 } else {
5934 pte |= pt_attr_leaf_x(pt_attr);
5935 }
5936 }
5937
5938 if (pmap == kernel_pmap) {
5939 #if __ARM_KERNEL_PROTECT__
5940 pte |= ARM_PTE_NG;
5941 #endif /* __ARM_KERNEL_PROTECT__ */
5942 if (prot & VM_PROT_WRITE) {
5943 pte |= ARM_PTE_AP(AP_RWNA);
5944 *pp_attr_bits |= PP_ATTR_MODIFIED | PP_ATTR_REFERENCED;
5945 } else {
5946 pte |= ARM_PTE_AP(AP_RONA);
5947 *pp_attr_bits |= PP_ATTR_REFERENCED;
5948 }
5949 } else {
5950 if (pmap->type != PMAP_TYPE_NESTED) {
5951 pte |= ARM_PTE_NG;
5952 } else if ((pmap->nested_region_unnested_table_bitmap)
5953 && (va >= pmap->nested_region_addr)
5954 && (va < (pmap->nested_region_addr + pmap->nested_region_size))) {
5955 unsigned int index = (unsigned int)((va - pmap->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
5956
5957 if ((pmap->nested_region_unnested_table_bitmap)
5958 && testbit(index, (int *)pmap->nested_region_unnested_table_bitmap)) {
5959 pte |= ARM_PTE_NG;
5960 }
5961 }
5962 if (prot & VM_PROT_WRITE) {
5963 assert(pmap->type != PMAP_TYPE_NESTED);
5964 if (pa_valid(pa) && (!ppattr_pa_test_bits(pa, PP_ATTR_MODIFIED))) {
5965 if (fault_type & VM_PROT_WRITE) {
5966 pte |= pt_attr_leaf_rw(pt_attr);
5967 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODIFIED;
5968 } else {
5969 pte |= pt_attr_leaf_ro(pt_attr);
5970 /*
5971 * Mark the page as MODFAULT so that a subsequent write
5972 * may be handled through arm_fast_fault().
5973 */
5974 *pp_attr_bits |= PP_ATTR_REFERENCED | PP_ATTR_MODFAULT;
5975 pte_set_was_writeable(pte, true);
5976 }
5977 } else {
5978 pte |= pt_attr_leaf_rw(pt_attr);
5979 *pp_attr_bits |= PP_ATTR_REFERENCED;
5980 }
5981 } else {
5982 if (set_XO) {
5983 pte |= pt_attr_leaf_rona(pt_attr);
5984 } else {
5985 pte |= pt_attr_leaf_ro(pt_attr);
5986 }
5987 *pp_attr_bits |= PP_ATTR_REFERENCED;
5988 }
5989 }
5990
5991 pte |= ARM_PTE_AF;
5992 return pte;
5993 }
5994
/*
 * Internal implementation of pmap_enter_options(): enter (or replace) the
 * mapping for virtual address 'v' -> physical address 'pa' in 'pmap'.
 *
 * Because the pmap lock may only be held shared, the PTE update is performed
 * as a retryable transaction: state that is expensive to roll back (PV list
 * entries, physical page attributes, ledgers) is only committed after
 * pmap_enter_pte() succeeds.
 *
 * Returns:
 *	KERN_SUCCESS		mapping entered (or only table expansion was
 *				requested via PMAP_OPTIONS_NOENTER).
 *	KERN_ABORTED		lock acquisition was aborted (preemption
 *				pending); the caller is expected to retry.
 *	KERN_RESOURCE_SHORTAGE	allocation failed under PMAP_OPTIONS_NOWAIT.
 *	KERN_FAILURE		attempt to make an executable mapping of
 *				unmanaged (non pa_valid) memory.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_enter_options_internal(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options)
{
	ppnum_t pn = (ppnum_t)atop(pa);
	pt_entry_t pte;
	pt_entry_t spte;
	pt_entry_t *pte_p;
	bool refcnt_updated;
	bool wiredcnt_updated;
	bool ro_va = false;
	unsigned int wimg_bits;
	bool committed = false, drop_refcnt = false, had_valid_mapping = false, skip_footprint_debit = false;
	pmap_lock_mode_t lock_mode = PMAP_LOCK_SHARED;
	kern_return_t kr = KERN_SUCCESS;
	uint16_t pp_attr_bits;
	volatile uint16_t *refcnt;
	volatile uint16_t *wiredcnt;
	pv_free_list_t *local_pv_free;

	validate_pmap_mutable(pmap);

#if XNU_MONITOR
	/* The PPL cannot block, so the kernel-side caller must pass NOWAIT and retry. */
	if (__improbable((options & PMAP_OPTIONS_NOWAIT) == 0)) {
		panic("%s called without PMAP_OPTIONS_NOWAIT set", __func__);
	}
#endif

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Sanity-check the request before touching any state. */
	if (__improbable(v & pt_attr_leaf_offmask(pt_attr))) {
		panic("%s: pmap %p v 0x%llx not page-aligned",
		    __func__, pmap, (unsigned long long)v);
	}

	if (__improbable((v < pmap->min) || (v >= pmap->max) || (v < pt_attr_pagezero_size(pt_attr)))) {
		panic("%s: attempt to map illegal VA 0x%llx in pmap %p", __func__, (unsigned long long)v, pmap);
	}

	/* Only check kernel_pmap here as CPUWINDOWS only exists in kernel address space. */
	if (__improbable((pmap == kernel_pmap) && (v >= CPUWINDOWS_BASE) && (v < CPUWINDOWS_TOP))) {
		panic("pmap_enter_options() kernel pmap %p v 0x%llx belongs to [CPUWINDOWS_BASE: 0x%llx, CPUWINDOWS_TOP: 0x%llx)",
		    pmap, (uint64_t)v, (uint64_t)CPUWINDOWS_BASE, (uint64_t)CPUWINDOWS_TOP);
	}

	if ((pa) & pt_attr_leaf_offmask(pt_attr)) {
		panic("pmap_enter_options() pmap %p pa 0x%llx",
		    pmap, (uint64_t)pa);
	}

	/* The PA should not extend beyond the architected physical address space */
	pa &= ARM_PTE_PAGE_MASK;

	if ((prot & VM_PROT_EXECUTE) && (pmap == kernel_pmap)) {
#if defined(KERNEL_INTEGRITY_CTRR) && defined(CONFIG_XNUPOST)
		extern vm_offset_t ctrr_test_page;
		if (__probable(v != ctrr_test_page))
#endif
		panic("pmap_enter_options(): attempt to add executable mapping to kernel_pmap");
	}
	if (__improbable((pmap == kernel_pmap) && zone_spans_ro_va(v, v + pt_attr_page_size(pt_attr)))) {
		/* Read-only zone VAs may only be mapped RO; remember to write-protect on success. */
		if (__improbable(prot != VM_PROT_READ)) {
			panic("%s: attempt to map RO zone VA 0x%llx with prot 0x%x",
			    __func__, (unsigned long long)v, prot);
		}
		ro_va = true;
	}
	assert(pn != vm_page_fictitious_addr);

	refcnt_updated = false;
	wiredcnt_updated = false;

	if ((prot & VM_PROT_EXECUTE) || TEST_PAGE_RATIO_4) {
		/*
		 * We need to take the lock exclusive here because of SPLAY_FIND in pmap_cs_enforce.
		 *
		 * See rdar://problem/59655632 for thoughts on synchronization and the splay tree
		 */
		lock_mode = PMAP_LOCK_EXCLUSIVE;
	}

	if (!pmap_lock_preempt(pmap, lock_mode)) {
		return KERN_ABORTED;
	}

	/*
	 * Expand pmap to include this pte. Assume that
	 * pmap is always expanded to include enough hardware
	 * pages to map one VM page.
	 */
	while ((pte_p = pmap_pte(pmap, v)) == PT_ENTRY_NULL) {
		/* Must unlock to expand the pmap. */
		pmap_unlock(pmap, lock_mode);

		kr = pmap_expand(pmap, v, options, pt_attr_leaf_level(pt_attr));

		if (kr != KERN_SUCCESS) {
			return kr;
		}

		if (!pmap_lock_preempt(pmap, lock_mode)) {
			return KERN_ABORTED;
		}
	}

	if (options & PMAP_OPTIONS_NOENTER) {
		/* Caller only wanted the page tables expanded; nothing to enter. */
		pmap_unlock(pmap, lock_mode);
		return KERN_SUCCESS;
	}

	/*
	 * Since we may not hold the pmap lock exclusive, updating the pte is
	 * done via a cmpxchg loop.
	 * We need to be careful about modifying non-local data structures before commiting
	 * the new pte since we may need to re-do the transaction.
	 */
	spte = os_atomic_load(pte_p, relaxed);
	while (!committed) {
		refcnt = NULL;
		wiredcnt = NULL;
		pv_alloc_return_t pv_status = PV_ALLOC_SUCCESS;
		had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;

		if (pmap != kernel_pmap) {
			ptd_info_t *ptd_info = ptep_get_info(pte_p);
			refcnt = &ptd_info->refcnt;
			wiredcnt = &ptd_info->wiredcnt;
			/*
			 * This check is really intended to ensure that mappings in a nested pmap can't be inserted
			 * through a top-level user pmap, which would allow a non-global mapping to be inserted into a shared
			 * region pmap and leveraged into a TLB-based write gadget (rdar://91504354).
			 * It's also a useful sanity check for other pmap types, but note that kernel page tables may not
			 * have PTDs, so we can't use the check there.
			 */
			if (__improbable(ptep_get_pmap(pte_p) != pmap)) {
				panic("%s: attempt to enter mapping at pte %p owned by pmap %p through pmap %p",
				    __func__, pte_p, ptep_get_pmap(pte_p), pmap);
			}
			/*
			 * Bump the wired count to keep the PTE page from being reclaimed. We need this because
			 * we may drop the PVH and pmap locks later in pmap_enter() if we need to allocate
			 * or acquire the pmap lock exclusive.
			 */
			if (!wiredcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)wiredcnt);
				wiredcnt_updated = true;
			}
			if (!refcnt_updated) {
				OSAddAtomic16(1, (volatile int16_t*)refcnt);
				refcnt_updated = true;
				drop_refcnt = true;
			}
		}

		if (had_valid_mapping && (pte_to_pa(spte) != pa)) {
			/*
			 * There is already a mapping here & it's for a different physical page.
			 * First remove that mapping.
			 *
			 * This requires that we take the pmap lock exclusive in order to call pmap_remove_range.
			 */
			if (lock_mode == PMAP_LOCK_SHARED) {
				if (pmap_lock_shared_to_exclusive(pmap)) {
					lock_mode = PMAP_LOCK_EXCLUSIVE;
				} else {
					/*
					 * We failed to upgrade to an exclusive lock.
					 * As a result we no longer hold the lock at all,
					 * so we need to re-acquire it and restart the transaction.
					 */
					pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
					lock_mode = PMAP_LOCK_EXCLUSIVE;
					/* pmap might have changed after we dropped the lock. Try again. */
					spte = os_atomic_load(pte_p, relaxed);
					continue;
				}
			}
			pmap_remove_range(pmap, v, pte_p, pte_p + PAGE_RATIO);
			spte = ARM_PTE_TYPE_FAULT;
			assert(os_atomic_load(pte_p, acquire) == ARM_PTE_TYPE_FAULT);
		}

		/*
		 * The XO index is used for TPRO mappings. To avoid exposing them as --x,
		 * the VM code tracks VM_MAP_TPRO requests and couples them with the proper
		 * read-write protection. The PMAP layer though still needs to use the right
		 * index, which is the older XO-now-TPRO one and that is specially selected
		 * here thanks to PMAP_OPTIONS_MAP_TPRO.
		 */
		if (options & PMAP_OPTIONS_MAP_TPRO) {
			if (__improbable(pmap == kernel_pmap)) {
				panic("%s: attempt to create kernel TPRO mapping, will produce kernel RX mapping instead.",
				    __func__);
			}
			pte = pmap_construct_pte(pmap, v, pa, VM_PROT_RORW_TP, fault_type, wired, pt_attr, &pp_attr_bits);
		} else {
			pte = pmap_construct_pte(pmap, v, pa, prot, fault_type, wired, pt_attr, &pp_attr_bits);
		}

		if (pa_valid(pa)) {
			/* Managed (kernel-tracked) memory: PV list and attribute accounting apply. */
			unsigned int pai;
			boolean_t is_altacct = FALSE, is_internal = FALSE, is_reusable = FALSE, is_external = FALSE;

			is_internal = FALSE;
			is_altacct = FALSE;

			pai = pa_index(pa);

			pvh_lock(pai);

			/*
			 * Make sure that the current per-cpu PV free list has
			 * enough entries (2 in the worst-case scenario) to handle the enter_pv
			 * if the transaction succeeds. We're either in the
			 * PPL (which can't be preempted) or we've explicitly disabled preemptions.
			 * Note that we can still be interrupted, but a primary
			 * interrupt handler can never enter the pmap.
			 */
#if !XNU_MONITOR
			assert(get_preemption_level() > 0);
#endif
			local_pv_free = &pmap_get_cpu_data()->pv_free;
			pv_entry_t **pv_h = pai_to_pvh(pai);
			const bool allocation_required = !pvh_test_type(pv_h, PVH_TYPE_NULL) &&
			    !(pvh_test_type(pv_h, PVH_TYPE_PTEP) && pvh_ptep(pv_h) == pte_p);

			if (__improbable(allocation_required && (local_pv_free->count < 2))) {
				pv_entry_t *new_pve_p[2] = {PV_ENTRY_NULL};
				int new_allocated_pves = 0;

				while (new_allocated_pves < 2) {
					local_pv_free = &pmap_get_cpu_data()->pv_free;
					pv_status = pv_alloc(pmap, pai, lock_mode, options, &new_pve_p[new_allocated_pves]);
					if (pv_status == PV_ALLOC_FAIL) {
						break;
					} else if (pv_status == PV_ALLOC_RETRY) {
						/*
						 * In the case that pv_alloc() had to grab a new page of PVEs,
						 * it will have dropped the pmap lock while doing so.
						 * On non-PPL devices, dropping the lock re-enables preemption so we may
						 * be on a different CPU now.
						 */
						local_pv_free = &pmap_get_cpu_data()->pv_free;
					} else {
						/* If we've gotten this far then a node should've been allocated. */
						assert(new_pve_p[new_allocated_pves] != PV_ENTRY_NULL);

						new_allocated_pves++;
					}
				}

				/* Return the pre-allocated nodes to the free list; enter_pv will draw from it. */
				for (int i = 0; i < new_allocated_pves; i++) {
					pv_free(new_pve_p[i]);
				}
			}

			if (pv_status == PV_ALLOC_FAIL) {
				pvh_unlock(pai);
				kr = KERN_RESOURCE_SHORTAGE;
				break;
			} else if (pv_status == PV_ALLOC_RETRY) {
				pvh_unlock(pai);
				/* We dropped the pmap and PVH locks to allocate. Retry transaction. */
				spte = os_atomic_load(pte_p, relaxed);
				continue;
			}

			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			} else {
				wimg_bits = pmap_cache_attributes(pn);
			}

			/* We may be retrying this operation after dropping the PVH lock.
			 * Cache attributes for the physical page may have changed while the lock
			 * was dropped, so clear any cache attributes we may have previously set
			 * in the PTE template. */
			pte &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			/* The regular old kernel is not allowed to remap PPL pages. */
			if (__improbable(ppattr_pa_test_monitor(pa))) {
				panic("%s: page belongs to PPL, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void*)pa, prot, fault_type, flags, wired, options);
			}

			if (__improbable(pvh_get_flags(pai_to_pvh(pai)) & PVH_FLAG_LOCKDOWN_MASK)) {
				panic("%s: page locked down, "
				    "pmap=%p, v=0x%llx, pa=%p, prot=0x%x, fault_type=0x%x, flags=0x%x, wired=%u, options=0x%x",
				    __FUNCTION__,
				    pmap, v, (void *)pa, prot, fault_type, flags, wired, options);
			}
#endif



			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (!committed) {
				pvh_unlock(pai);
				continue;
			}
			had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
			/* End of transaction. Commit pv changes, pa bits, and memory accounting. */

			assert(!had_valid_mapping || (pte_to_pa(spte) == pa));
			/*
			 * If there was already a valid pte here then we reuse its reference
			 * on the ptd and drop the one that we took above.
			 */
			drop_refcnt = had_valid_mapping;

			if (!had_valid_mapping) {
				pv_entry_t *new_pve_p = PV_ENTRY_NULL;
				int pve_ptep_idx = 0;
				pv_status = pmap_enter_pv(pmap, pte_p, pai, options, lock_mode, &new_pve_p, &pve_ptep_idx);
				/* We did all the allocations up top. So this shouldn't be able to fail. */
				if (pv_status != PV_ALLOC_SUCCESS) {
					panic("%s: unexpected pmap_enter_pv ret code: %d. new_pve_p=%p pmap=%p",
					    __func__, pv_status, new_pve_p, pmap);
				}

				if (pmap != kernel_pmap) {
					if (options & PMAP_OPTIONS_INTERNAL) {
						ppattr_pve_set_internal(pai, new_pve_p, pve_ptep_idx);
						if ((options & PMAP_OPTIONS_ALT_ACCT) ||
						    PMAP_FOOTPRINT_SUSPENDED(pmap)) {
							/*
							 * Make a note to ourselves that this
							 * mapping is using alternative
							 * accounting. We'll need this in order
							 * to know which ledger to debit when
							 * the mapping is removed.
							 *
							 * The altacct bit must be set while
							 * the pv head is locked. Defer the
							 * ledger accounting until after we've
							 * dropped the lock.
							 */
							ppattr_pve_set_altacct(pai, new_pve_p, pve_ptep_idx);
							is_altacct = TRUE;
						}
					}
					if (ppattr_test_reusable(pai) &&
					    !is_altacct) {
						is_reusable = TRUE;
					} else if (options & PMAP_OPTIONS_INTERNAL) {
						is_internal = TRUE;
					} else {
						is_external = TRUE;
					}
				}
			}

			pvh_unlock(pai);

			if (pp_attr_bits != 0) {
				ppattr_pa_set_bits(pa, pp_attr_bits);
			}

			if (!had_valid_mapping && (pmap != kernel_pmap)) {
				/* Brand-new user mapping: credit the appropriate task ledgers. */
				pmap_ledger_credit(pmap, task_ledgers.phys_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (is_internal) {
					/*
					 * Make corresponding adjustments to
					 * phys_footprint statistics.
					 */
					pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					if (is_altacct) {
						/*
						 * If this page is internal and
						 * in an IOKit region, credit
						 * the task's total count of
						 * dirty, internal IOKit pages.
						 * It should *not* count towards
						 * the task's total physical
						 * memory footprint, because
						 * this entire region was
						 * already billed to the task
						 * at the time the mapping was
						 * created.
						 *
						 * Put another way, this is
						 * internal++ and
						 * alternate_accounting++, so
						 * net effect on phys_footprint
						 * is 0. That means: don't
						 * touch phys_footprint here.
						 */
						pmap_ledger_credit(pmap, task_ledgers.alternate_accounting, pt_attr_page_size(pt_attr) * PAGE_RATIO);
					} else {
						if (ARM_PTE_IS_COMPRESSED(spte, pte_p) && !(spte & ARM_PTE_COMPRESSED_ALT)) {
							/* Replacing a compressed page (with internal accounting). No change to phys_footprint. */
							skip_footprint_debit = true;
						} else {
							pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
						}
					}
				}
				if (is_reusable) {
					pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (is_external) {
					pmap_ledger_credit(pmap, task_ledgers.external, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
			}
		} else {
			/* Unmanaged (device/IO) memory: no PV tracking; executable mappings are refused. */
			if (prot & VM_PROT_EXECUTE) {
				kr = KERN_FAILURE;
				break;
			}

			wimg_bits = pmap_cache_attributes(pn);
			if ((flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT))) {
				wimg_bits = (wimg_bits & (~VM_WIMG_MASK)) | (flags & (VM_WIMG_MASK | VM_WIMG_USE_DEFAULT));
			}

			pte |= pmap_get_pt_ops(pmap)->wimg_to_pte(wimg_bits, pa);

#if XNU_MONITOR
			pte = pmap_construct_io_pte(pa, pte);

			/**
			 * PPL-protected MMIO mappings handed to IOMMUs must be PPL-writable and cannot be removed,
			 * but in support of hibernation we allow temporary read-only mappings of these pages to be
			 * created and later removed. We must therefore prevent an attacker from downgrading a
			 * a writable mapping in order to allow it to be removed and remapped to something else.
			 */
			if (__improbable((wimg_bits & PP_ATTR_MONITOR) &&
			    ((spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE) &&
			    (pte_to_xprr_perm(spte) != XPRR_KERN_RO_PERM) &&
			    (pte_to_xprr_perm(pte) == XPRR_KERN_RO_PERM))) {
				panic("%s: attempt to downgrade mapping of writable PPL-protected I/O address 0x%llx",
				    __func__, (uint64_t)pte_to_pa(spte));
			}
#endif

			committed = pmap_enter_pte(pmap, pte_p, &spte, pte, v);
			if (committed) {
				had_valid_mapping = (spte & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE;
				assert(!had_valid_mapping || (pte_to_pa(spte) == pa));

				/**
				 * If there was already a valid pte here then we reuse its
				 * reference on the ptd and drop the one that we took above.
				 */
				drop_refcnt = had_valid_mapping;
			}
		}
		if (committed) {
			if (ARM_PTE_IS_COMPRESSED(spte, pte_p)) {
				assert(pmap != kernel_pmap);

				/* One less "compressed" */
				pmap_ledger_debit(pmap, task_ledgers.internal_compressed,
				    pt_attr_page_size(pt_attr) * PAGE_RATIO);

				if (spte & ARM_PTE_COMPRESSED_ALT) {
					pmap_ledger_debit(pmap, task_ledgers.alternate_accounting_compressed, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				} else if (!skip_footprint_debit) {
					/* Was part of the footprint */
					pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
				}
				/* The old entry held a reference so drop the extra one that we took above. */
				drop_refcnt = true;
			}
		}
	}

	/* Transaction complete (or aborted): release the extra PTD references taken above. */
	if (drop_refcnt && refcnt != NULL) {
		assert(refcnt_updated);
		if (OSAddAtomic16(-1, (volatile int16_t*)refcnt) <= 0) {
			panic("pmap_enter(): over-release of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
		}
	}

	if (wiredcnt_updated && (OSAddAtomic16(-1, (volatile int16_t*)wiredcnt) <= 0)) {
		panic("pmap_enter(): over-unwire of ptdp %p for pte %p", ptep_get_ptd(pte_p), pte_p);
	}

	pmap_unlock(pmap, lock_mode);

	/* RO-zone VAs are mapped read-only: physically write-protect now that the PTE is live. */
	if (__improbable(ro_va && kr == KERN_SUCCESS)) {
		pmap_phys_write_disable(v);
	}

	return kr;
}
6492
/*
 * Enter a mapping for 'v' -> 'pa', retrying the internal (or PPL) call as
 * needed: KERN_ABORTED (lock acquisition aborted for pending preemption)
 * always retries, and KERN_RESOURCE_SHORTAGE triggers page donation to the
 * PPL (on XNU_MONITOR systems) and a retry unless the caller requested
 * PMAP_OPTIONS_NOWAIT.
 */
kern_return_t
pmap_enter_options_addr(
	pmap_t pmap,
	vm_map_address_t v,
	pmap_paddr_t pa,
	vm_prot_t prot,
	vm_prot_t fault_type,
	unsigned int flags,
	boolean_t wired,
	unsigned int options,
	__unused void *arg,
	__unused pmap_mapping_type_t mapping_type)
{
	kern_return_t kr = KERN_FAILURE;


	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v), pa, prot);


	const bool nowait_requested = (options & PMAP_OPTIONS_NOWAIT) != 0;
	do {
#if XNU_MONITOR
		/* The PPL cannot wait, so it is always invoked with NOWAIT; we retry here instead. */
		kr = pmap_enter_options_ppl(pmap, v, pa, prot, fault_type, flags, wired, options | PMAP_OPTIONS_NOWAIT);
#else
		kr = pmap_enter_options_internal(pmap, v, pa, prot, fault_type, flags, wired, options);
#endif

		if (kr == KERN_RESOURCE_SHORTAGE) {
#if XNU_MONITOR
			/* Donate a page to the PPL so the retry can succeed. */
			pmap_alloc_page_for_ppl(nowait_requested ? PMAP_PAGES_ALLOCATE_NOWAIT : 0);
#endif
			if (nowait_requested) {
				break;
			}
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

#if XNU_MONITOR
	pmap_ledger_check_balance(pmap);
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__ENTER) | DBG_FUNC_END, kr);

	return kr;
}
6539
6540 kern_return_t
6541 pmap_enter_options(
6542 pmap_t pmap,
6543 vm_map_address_t v,
6544 ppnum_t pn,
6545 vm_prot_t prot,
6546 vm_prot_t fault_type,
6547 unsigned int flags,
6548 boolean_t wired,
6549 unsigned int options,
6550 __unused void *arg,
6551 pmap_mapping_type_t mapping_type)
6552 {
6553 return pmap_enter_options_addr(pmap, v, ((pmap_paddr_t)pn) << PAGE_SHIFT, prot, fault_type, flags, wired, options, arg, mapping_type);
6554 }
6555
6556 /*
6557 * Routine: pmap_change_wiring
6558 * Function: Change the wiring attribute for a map/virtual-address
6559 * pair.
6560 * In/out conditions:
6561 * The mapping must already exist in the pmap.
6562 */
6563 MARK_AS_PMAP_TEXT kern_return_t
6564 pmap_change_wiring_internal(
6565 pmap_t pmap,
6566 vm_map_address_t v,
6567 boolean_t wired)
6568 {
6569 pt_entry_t *pte_p;
6570 pmap_paddr_t pa;
6571
6572 validate_pmap_mutable(pmap);
6573
6574 if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
6575 return KERN_ABORTED;
6576 }
6577
6578 const pt_attr_t * pt_attr = pmap_get_pt_attr(pmap);
6579
6580 pte_p = pmap_pte(pmap, v);
6581 if (pte_p == PT_ENTRY_NULL) {
6582 if (!wired) {
6583 /*
6584 * The PTE may have already been cleared by a disconnect/remove operation, and the L3 table
6585 * may have been freed by a remove operation.
6586 */
6587 goto pmap_change_wiring_return;
6588 } else {
6589 panic("%s: Attempt to wire nonexistent PTE for pmap %p", __func__, pmap);
6590 }
6591 }
6592 /*
6593 * Use volatile loads to prevent the compiler from collapsing references to 'pa' back to loads of pte_p
6594 * until we've grabbed the final PVH lock; PTE contents may change during this time.
6595 */
6596 pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6597
6598 while (pa_valid(pa)) {
6599 pmap_paddr_t new_pa;
6600
6601 pvh_lock(pa_index(pa));
6602 new_pa = pte_to_pa(*((volatile pt_entry_t*)pte_p));
6603
6604 if (pa == new_pa) {
6605 break;
6606 }
6607
6608 pvh_unlock(pa_index(pa));
6609 pa = new_pa;
6610 }
6611
6612 /* PTE checks must be performed after acquiring the PVH lock (if applicable for the PA) */
6613 if ((*pte_p == ARM_PTE_EMPTY) || (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p))) {
6614 if (!wired) {
6615 /* PTE cleared by prior remove/disconnect operation */
6616 goto pmap_change_wiring_cleanup;
6617 } else {
6618 panic("%s: Attempt to wire empty/compressed PTE %p (=0x%llx) for pmap %p",
6619 __func__, pte_p, (uint64_t)*pte_p, pmap);
6620 }
6621 }
6622
6623 assertf((*pte_p & ARM_PTE_TYPE_VALID) == ARM_PTE_TYPE, "invalid pte %p (=0x%llx)", pte_p, (uint64_t)*pte_p);
6624 if (wired != pte_is_wired(*pte_p)) {
6625 pte_set_wired(pmap, pte_p, wired);
6626 if (pmap != kernel_pmap) {
6627 if (wired) {
6628 pmap_ledger_credit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6629 } else if (!wired) {
6630 pmap_ledger_debit(pmap, task_ledgers.wired_mem, pt_attr_page_size(pt_attr) * PAGE_RATIO);
6631 }
6632 }
6633 }
6634
6635 pmap_change_wiring_cleanup:
6636 if (pa_valid(pa)) {
6637 pvh_unlock(pa_index(pa));
6638 }
6639
6640 pmap_change_wiring_return:
6641 pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
6642
6643 return KERN_SUCCESS;
6644 }
6645
/*
 * Public entry point for changing the wired attribute of an existing mapping.
 * Retries the PPL call on KERN_ABORTED (pending-preemption yield); any other
 * failure is fatal.
 */
void
pmap_change_wiring(
	pmap_t pmap,
	vm_map_address_t v,
	boolean_t wired)
{
	/* This function is going to lock the pmap lock, so it'd better be preemptible. */
	pmap_verify_preemptible();

	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	/* Attempting to lock the pmap lock can abort when there is pending preemption. Retry in such case. */
	do {
		kr = pmap_change_wiring_ppl(pmap, v, wired);
	} while (kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	/* Since we verified preemptibility, call the helper only once. */
	kr = pmap_change_wiring_internal(pmap, v, wired);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed with return code %d; pmap: 0x%016llx, v: 0x%016llx, wired: %s",
		    __func__, kr, (uint64_t) pmap, (uint64_t) v, wired ? "true" : "false");
	}
}
6673
6674 MARK_AS_PMAP_TEXT pmap_paddr_t
6675 pmap_find_pa_internal(
6676 pmap_t pmap,
6677 addr64_t va)
6678 {
6679 pmap_paddr_t pa = 0;
6680
6681 validate_pmap(pmap);
6682
6683 if (pmap != kernel_pmap) {
6684 pmap_lock(pmap, PMAP_LOCK_SHARED);
6685 }
6686
6687 pa = pmap_vtophys(pmap, va);
6688
6689 if (pmap != kernel_pmap) {
6690 pmap_unlock(pmap, PMAP_LOCK_SHARED);
6691 }
6692
6693 return pa;
6694 }
6695
6696 pmap_paddr_t
6697 pmap_find_pa_nofault(pmap_t pmap, addr64_t va)
6698 {
6699 pmap_paddr_t pa = 0;
6700
6701 if (pmap == kernel_pmap) {
6702 pa = mmu_kvtop(va);
6703 } else if ((current_thread()->map) && (pmap == vm_map_pmap(current_thread()->map))) {
6704 /*
6705 * Note that this doesn't account for PAN: mmu_uvtop() may return a valid
6706 * translation even if PAN would prevent kernel access through the translation.
6707 * It's therefore assumed the UVA will be accessed in a PAN-disabled context.
6708 */
6709 pa = mmu_uvtop(va);
6710 }
6711 return pa;
6712 }
6713
6714 pmap_paddr_t
6715 pmap_find_pa(
6716 pmap_t pmap,
6717 addr64_t va)
6718 {
6719 pmap_paddr_t pa = pmap_find_pa_nofault(pmap, va);
6720
6721 if (pa != 0) {
6722 return pa;
6723 }
6724
6725 if (not_in_kdp) {
6726 #if XNU_MONITOR
6727 return pmap_find_pa_ppl(pmap, va);
6728 #else
6729 return pmap_find_pa_internal(pmap, va);
6730 #endif
6731 } else {
6732 return pmap_vtophys(pmap, va);
6733 }
6734 }
6735
6736 ppnum_t
6737 pmap_find_phys_nofault(
6738 pmap_t pmap,
6739 addr64_t va)
6740 {
6741 ppnum_t ppn;
6742 ppn = atop(pmap_find_pa_nofault(pmap, va));
6743 return ppn;
6744 }
6745
6746 ppnum_t
6747 pmap_find_phys(
6748 pmap_t pmap,
6749 addr64_t va)
6750 {
6751 ppnum_t ppn;
6752 ppn = atop(pmap_find_pa(pmap, va));
6753 return ppn;
6754 }
6755
6756 /**
6757 * Translate a kernel virtual address into a physical address.
6758 *
6759 * @param va The kernel virtual address to translate. Does not work on user
6760 * virtual addresses.
6761 *
6762 * @return The physical address if the translation was successful, or zero if
6763 * no valid mappings were found for the given virtual address.
6764 */
6765 pmap_paddr_t
6766 kvtophys(vm_offset_t va)
6767 {
6768 /**
6769 * Attempt to do the translation first in hardware using the AT (address
6770 * translation) instruction. This will attempt to use the MMU to do the
6771 * translation for us.
6772 */
6773 pmap_paddr_t pa = mmu_kvtop(va);
6774
6775 if (pa) {
6776 return pa;
6777 }
6778
6779 /* If the MMU can't find the mapping, then manually walk the page tables. */
6780 return pmap_vtophys(kernel_pmap, va);
6781 }
6782
6783 /**
6784 * Variant of kvtophys that can't fail. If no mapping is found or the mapping
6785 * points to a non-kernel-managed physical page, then this call will panic().
6786 *
6787 * @note The output of this function is guaranteed to be a kernel-managed
6788 * physical page, which means it's safe to pass the output directly to
6789 * pa_index() to create a physical address index for various pmap data
6790 * structures.
6791 *
6792 * @param va The kernel virtual address to translate. Does not work on user
6793 * virtual addresses.
6794 *
6795 * @return The translated physical address for the given virtual address.
6796 */
6797 pmap_paddr_t
6798 kvtophys_nofail(vm_offset_t va)
6799 {
6800 pmap_paddr_t pa = kvtophys(va);
6801
6802 if (!pa_valid(pa)) {
6803 panic("%s: Invalid or non-kernel-managed physical page returned, "
6804 "pa: %#llx, va: %p", __func__, (uint64_t)pa, (void *)va);
6805 }
6806
6807 return pa;
6808 }
6809
/*
 * Software page-table walk: translate 'va' to a physical address in 'pmap'.
 * Returns 0 if 'va' is outside the pmap's VA range or no valid mapping is
 * found at any level.  The caller is responsible for whatever locking is
 * needed to keep the tables stable during the walk.
 */
pmap_paddr_t
pmap_vtophys(
	pmap_t pmap,
	addr64_t va)
{
	if ((va < pmap->min) || (va >= pmap->max)) {
		return 0;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	tt_entry_t * ttp = NULL;
	tt_entry_t * ttep = NULL;
	tt_entry_t tte = ARM_TTE_EMPTY;
	pmap_paddr_t pa = 0;
	unsigned int cur_level;

	ttp = pmap->tte;

	/* Descend from the root level toward the leaf level, one table per iteration. */
	for (cur_level = pt_attr_root_level(pt_attr); cur_level <= pt_attr_leaf_level(pt_attr); cur_level++) {
		ttep = &ttp[ttn_index(pt_attr, va, cur_level)];

		tte = *ttep;

		/* Per-level format info: validity, table-vs-block discriminator, offset width. */
		const uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
		const uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
		const uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;
		const uint64_t offmask = pt_attr->pta_level_info[cur_level].offmask;

		if ((tte & valid_mask) != valid_mask) {
			return (pmap_paddr_t) 0;
		}

		/* This detects both leaf entries and intermediate block mappings. */
		if ((tte & type_mask) == type_block) {
			/* Combine the entry's output address with the offset bits from 'va'. */
			pa = ((tte & ARM_TTE_PA_MASK & ~offmask) | (va & offmask));
			break;
		}

		/* Table entry: follow it down to the next-level table. */
		ttp = (tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);
	}

	return pa;
}
6854
6855 /*
6856 * pmap_init_pte_page - Initialize a page table page.
6857 */
/*
 * Initialize a page used as a page table page: associate (or create) the
 * page table descriptor (PTD) recorded in the page's PV head, then record
 * the table's owning pmap, mapped VA, and level in that descriptor.
 *
 * @param pmap      The pmap that will own the new table.
 * @param pte_p     KVA of the page table page being initialized.
 * @param va        First VA the table will map.
 * @param ttlevel   Translation table level of the new table.
 * @param alloc_ptd Whether to allocate a PTD if the page has none
 *                  (only expected from early boot, see below).
 */
MARK_AS_PMAP_TEXT void
pmap_init_pte_page(
	pmap_t pmap,
	pt_entry_t *pte_p,
	vm_offset_t va,
	unsigned int ttlevel,
	boolean_t alloc_ptd)
{
	pt_desc_t *ptdp = NULL;
	pv_entry_t **pvh = pai_to_pvh(pa_index(ml_static_vtop((vm_offset_t)pte_p)));

	if (pvh_test_type(pvh, PVH_TYPE_NULL)) {
		if (alloc_ptd) {
			/*
			 * This path should only be invoked from arm_vm_init. If we are emulating 16KB pages
			 * on 4KB hardware, we may already have allocated a page table descriptor for a
			 * bootstrap request, so we check for an existing PTD here.
			 */
			ptdp = ptd_alloc(pmap);
			if (ptdp == NULL) {
				panic("%s: unable to allocate PTD", __func__);
			}
			pvh_update_head_unlocked(pvh, ptdp, PVH_TYPE_PTDP);
			/* Clear all PVH flags when using a page for a PTD to avoid tripping unexpected page flag usage checks. */
			pvh_set_flags(pvh, 0);
		} else {
			panic("pmap_init_pte_page(): pte_p %p", pte_p);
		}
	} else if (pvh_test_type(pvh, PVH_TYPE_PTDP)) {
		/* The page already has a descriptor; reuse it. */
		ptdp = pvh_ptd(pvh);
	} else {
		panic("pmap_init_pte_page(): invalid PVH type for pte_p %p", pte_p);
	}

	// below barrier ensures previous updates to the page are visible to PTW before
	// it is linked to the PTE of previous level
	__builtin_arm_dmb(DMB_ISHST);
	ptd_info_init(ptdp, pmap, va, ttlevel, pte_p);
}
6897
6898 /*
6899 * Routine: pmap_expand
6900 *
6901 * Expands a pmap to be able to map the specified virtual address.
6902 *
6903 * Allocates new memory for the default (COARSE) translation table
6904 * entry, initializes all the pte entries to ARM_PTE_TYPE_FAULT and
6905 * also allocates space for the corresponding pv entries.
6906 *
6907 * Nothing should be locked.
6908 */
/*
 * Grow 'pmap's translation table hierarchy until a table exists at 'level'
 * covering 'v'.  Allocation happens with the pmap lock dropped, so every
 * level re-checks for a concurrently-installed table after relocking and
 * frees its own allocation when it lost the race.
 *
 * @return KERN_SUCCESS on success; KERN_INVALID_ADDRESS if 'v' is outside
 *         the pmap's range; KERN_ABORTED if a lock attempt yielded to
 *         pending preemption; allocation errors under PMAP_OPTIONS_NOWAIT.
 */
MARK_AS_PMAP_TEXT static kern_return_t
pmap_expand(
	pmap_t pmap,
	vm_map_address_t v,
	unsigned int options,
	unsigned int level)
{
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	if (__improbable((v < pmap->min) || (v >= pmap->max))) {
		return KERN_INVALID_ADDRESS;
	}
	pmap_paddr_t pa;
	unsigned int ttlevel = pt_attr_root_level(pt_attr);
	tt_entry_t *tte_p;
	tt_entry_t *tt_p;

	pa = 0x0ULL;
	tt_p = (tt_entry_t *)NULL;

	/* Walk down from the root, creating one missing table level per iteration. */
	for (; ttlevel < level; ttlevel++) {
		if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
			return KERN_ABORTED;
		}

		if (pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL) {
			/* Drop the lock while allocating; another thread may install the table meanwhile. */
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
			kern_return_t ret;
			while ((ret = pmap_tt_allocate(pmap, &tt_p, ttlevel + 1, ((options & PMAP_TT_ALLOCATE_NOWAIT)? PMAP_PAGES_ALLOCATE_NOWAIT : 0))) != KERN_SUCCESS) {
				if (options & PMAP_OPTIONS_NOWAIT) {
					/* Can be KERN_RESOURCE_SHORTAGE or KERN_ABORTED. */
					return ret;
				}
#if XNU_MONITOR
				panic("%s: failed to allocate tt, "
				    "pmap=%p, v=%p, options=0x%x, level=%u",
				    __FUNCTION__,
				    pmap, (void *)v, options, level);
#else
				VM_PAGE_WAIT();
#endif
			}

			if (!pmap_lock_preempt(pmap, PMAP_LOCK_EXCLUSIVE)) {
				pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
				return KERN_ABORTED;
			}

			/* Re-check: the table may have appeared while the lock was dropped. */
			if ((pmap_ttne(pmap, ttlevel + 1, v) == PT_ENTRY_NULL)) {
				pmap_init_pte_page(pmap, (pt_entry_t *) tt_p, v, ttlevel + 1, FALSE);
				pa = kvtophys_nofail((vm_offset_t)tt_p);
				tte_p = pmap_ttne(pmap, ttlevel, v);
				/* Link the new table into its parent entry. */
				*tte_p = (pa & ARM_TTE_TABLE_MASK) | ARM_TTE_TYPE_TABLE | ARM_TTE_VALID;
				PMAP_TRACE(4 + ttlevel, PMAP_CODE(PMAP__TTE), VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(v & ~pt_attr_ln_offmask(pt_attr, ttlevel)),
				    VM_KERNEL_ADDRHIDE((v & ~pt_attr_ln_offmask(pt_attr, ttlevel)) + pt_attr_ln_size(pt_attr, ttlevel)), *tte_p);
				pa = 0x0ULL;
				tt_p = (tt_entry_t *)NULL;
			}
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_SHARED);
		}

		/* Lost the install race: release the table we allocated but never linked. */
		if (tt_p != (tt_entry_t *)NULL) {
			pmap_tt_deallocate(pmap, tt_p, ttlevel + 1);
			tt_p = (tt_entry_t *)NULL;
		}
	}

	return KERN_SUCCESS;
}
6980
6981 /*
6982 * Routine: pmap_gc
6983 * Function:
6984 * Pmap garbage collection
6985 * Called by the pageout daemon when pages are scarce.
6986 *
6987 */
void
pmap_gc(void)
{
	/*
	 * TODO: as far as I can tell this has never been implemented to do anything meaningful.
	 * We can't just destroy any old pmap on the chance that it may be active on a CPU
	 * or may contain wired mappings. However, with the relatively recent change to
	 * make pmap_page_reclaim() non-fatal in the event that it doesn't find an eligible
	 * page, it may make sense to call that function here.
	 */
}
6999
7000 /*
7001 * By default, don't attempt pmap GC more frequently
7002 * than once / 1 minutes.
7003 */
7004
/* Intentionally a no-op: pmap_gc() itself is unimplemented, so there is no throttle state to compute. */
void
compute_pmap_gc_throttle(
	void *arg __unused)
{
}
7010
7011 /*
7012 * pmap_attribute_cache_sync(vm_offset_t pa)
7013 *
7014 * Invalidates all of the instruction cache on a physical page and
7015 * pushes any dirty data from the data cache for the same physical page
7016 */
7017
7018 kern_return_t
7019 pmap_attribute_cache_sync(
7020 ppnum_t pp,
7021 vm_size_t size,
7022 __unused vm_machine_attribute_t attribute,
7023 __unused vm_machine_attribute_val_t * value)
7024 {
7025 if (size > PAGE_SIZE) {
7026 panic("pmap_attribute_cache_sync size: 0x%llx", (uint64_t)size);
7027 } else {
7028 cache_sync_page(pp);
7029 }
7030
7031 return KERN_SUCCESS;
7032 }
7033
7034 /*
7035 * pmap_sync_page_data_phys(ppnum_t pp)
7036 *
7037 * Invalidates all of the instruction cache on a physical page and
7038 * pushes any dirty data from the data cache for the same physical page
7039 */
/* Delegate directly to the per-page cache sync (I-cache invalidate + D-cache writeback). */
void
pmap_sync_page_data_phys(
	ppnum_t pp)
{
	cache_sync_page(pp);
}
7046
7047 /*
7048 * pmap_sync_page_attributes_phys(ppnum_t pp)
7049 *
7050 * Write back and invalidate all cachelines on a physical page.
7051 */
7052 void
7053 pmap_sync_page_attributes_phys(
7054 ppnum_t pp)
7055 {
7056 flush_dcache((vm_offset_t) (pp << PAGE_SHIFT), PAGE_SIZE, TRUE);
7057 }
7058
7059 #if CONFIG_COREDUMP
7060 /* temporary workaround */
7061 boolean_t
7062 coredumpok(
7063 vm_map_t map,
7064 mach_vm_offset_t va)
7065 {
7066 pt_entry_t *pte_p;
7067 pt_entry_t spte;
7068
7069 pte_p = pmap_pte(map->pmap, va);
7070 if (0 == pte_p) {
7071 return FALSE;
7072 }
7073 if (vm_map_entry_has_device_pager(map, va)) {
7074 return FALSE;
7075 }
7076 spte = *pte_p;
7077 return (spte & ARM_PTE_ATTRINDXMASK) == ARM_PTE_ATTRINDX(CACHE_ATTRINDX_DEFAULT);
7078 }
7079 #endif
7080
7081 void
7082 fillPage(
7083 ppnum_t pn,
7084 unsigned int fill)
7085 {
7086 unsigned int *addr;
7087 int count;
7088
7089 addr = (unsigned int *) phystokv(ptoa(pn));
7090 count = PAGE_SIZE / sizeof(unsigned int);
7091 while (count--) {
7092 *addr++ = fill;
7093 }
7094 }
7095
7096 extern void mapping_set_mod(ppnum_t pn);
7097
/* Exported alias for pmap_set_modify(); marks the page's software 'modified' attribute. */
void
mapping_set_mod(
	ppnum_t pn)
{
	pmap_set_modify(pn);
}
7104
7105 extern void mapping_set_ref(ppnum_t pn);
7106
/* Exported alias for pmap_set_reference(); marks the page's software 'referenced' attribute. */
void
mapping_set_ref(
	ppnum_t pn)
{
	pmap_set_reference(pn);
}
7113
7114 /*
7115 * Clear specified attribute bits.
7116 *
7117 * Try to force an arm_fast_fault() for all mappings of
7118 * the page - to force attributes to be set again at fault time.
7119 * If the forcing succeeds, clear the cached bits at the head.
7120 * Otherwise, something must have been wired, so leave the cached
7121 * attributes alone.
7122 */
/*
 * Clear the given pp_attr bits on page 'pn', forcing future accesses to
 * fault (so the attributes get re-established) unless a cheaper short
 * circuit applies.  'flush_range', when non-NULL, lets the caller batch
 * TLB invalidations for a whole VA range instead of per page.
 */
MARK_AS_PMAP_TEXT static void
phys_attribute_clear_with_flush_range(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t pa = ptoa(pn);
	vm_prot_t allow_mode = VM_PROT_ALL;

#if XNU_MONITOR
	/* Bits owned by the PPL may not be cleared on behalf of the kernel. */
	if (__improbable(bits & PP_ATTR_PPL_OWNED_BITS)) {
		panic("%s: illegal request, "
		    "pn=%u, bits=%#x, options=%#x, arg=%p, flush_range=%p",
		    __FUNCTION__,
		    pn, bits, options, arg, flush_range);
	}
#endif
	/* A caller-provided flush mechanism overrides any NOFLUSH request. */
	if ((arg != NULL) || (flush_range != NULL)) {
		options = options & ~PMAP_OPTIONS_NOFLUSH;
	}

	if (__improbable((options & (PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_FF_LOCKED)) != 0)) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "invalid options",
		    pn, bits, options, arg, flush_range);
	}

	if (__improbable((bits & PP_ATTR_MODIFIED) &&
	    (options & PMAP_OPTIONS_NOFLUSH))) {
		panic("phys_attribute_clear(%#010x,%#010x,%#010x,%p,%p): "
		    "should not clear 'modified' without flushing TLBs",
		    pn, bits, options, arg, flush_range);
	}

	assert(pn != vm_page_fictitious_addr);

	if (options & PMAP_OPTIONS_CLEAR_WRITE) {
		assert(bits == PP_ATTR_MODIFIED);

		pmap_page_protect_options_with_flush_range(pn, (VM_PROT_ALL & ~VM_PROT_WRITE), options, flush_range);
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear the modified bit.
		 * pmap_page_protect has taken care of resetting
		 * the state so that we'll see the next write as a fault to
		 * the VM (i.e. we don't want a fast fault).
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}
	/* Decide which access modes must fault so the cleared bits get re-set. */
	if (bits & PP_ATTR_REFERENCED) {
		allow_mode &= ~(VM_PROT_READ | VM_PROT_EXECUTE);
	}
	if (bits & PP_ATTR_MODIFIED) {
		allow_mode &= ~VM_PROT_WRITE;
	}

	if (bits == PP_ATTR_NOENCRYPT) {
		/*
		 * We short circuit this case; it should not need to
		 * invoke arm_force_fast_fault, so just clear and
		 * return. On ARM, this bit is just a debugging aid.
		 */
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
		return;
	}

	/* Only clear the cached bits if every mapping could be demoted (nothing wired). */
	if (arm_force_fast_fault_with_flush_range(pn, allow_mode, options, flush_range)) {
		ppattr_pa_clear_bits(pa, (pp_attr_t)bits);
	}
}
7196
/* Single-page attribute clear: delegate with no batched flush range. */
MARK_AS_PMAP_TEXT void
phys_attribute_clear_internal(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	phys_attribute_clear_with_flush_range(pn, bits, options, arg, NULL);
}
7206
7207 #if __ARM_RANGE_TLBI__
/*
 * Clear attributes for every managed page mapped in [start, end) within a
 * single twig (one leaf table's coverage).  Returns the VA where the scan
 * stopped: 'end' when complete, or an earlier VA if preemption became
 * pending (the caller resumes from there).
 */
MARK_AS_PMAP_TEXT static vm_map_address_t
phys_attribute_clear_twig_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_assert_locked(pmap, PMAP_LOCK_SHARED);
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	assert(end >= start);
	/* The range must not span more than one twig. */
	assert((end - start) <= pt_attr_twig_size(pt_attr));
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	vm_map_address_t va = start;
	pt_entry_t *pte_p, *start_pte_p, *end_pte_p, *curr_pte_p;
	tt_entry_t *tte_p;
	tte_p = pmap_tte(pmap, start);
	unsigned int npages = 0;

	if (tte_p == (tt_entry_t *) NULL) {
		return end;
	}

	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		pte_p = (pt_entry_t *) ttetokv(*tte_p);

		start_pte_p = &pte_p[pte_index(pt_attr, start)];
		end_pte_p = start_pte_p + ((end - start) >> pt_attr_leaf_shift(pt_attr));
		assert(end_pte_p >= start_pte_p);
		for (curr_pte_p = start_pte_p; curr_pte_p < end_pte_p; curr_pte_p++, va += pmap_page_size) {
			/* Yield to pending preemption, but always make progress on at least one page. */
			if (__improbable(npages++ && pmap_pending_preemption())) {
				return va;
			}
			pmap_paddr_t pa = pte_to_pa(*((volatile pt_entry_t*)curr_pte_p));
			if (pa_valid(pa)) {
				ppnum_t pn = (ppnum_t) atop(pa);
				phys_attribute_clear_with_flush_range(pn, bits, options, NULL, flush_range);
			}
		}
	}
	return end;
}
7251
/*
 * Clear attributes for all managed pages mapped in [start, end), batching
 * TLB invalidation into one ranged flush at the end.  Returns the VA where
 * processing stopped (== end on completion); callers loop on the return
 * value to resume after preemption-induced early exits.
 */
MARK_AS_PMAP_TEXT vm_map_address_t
phys_attribute_clear_range_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	if (__improbable(end < start)) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}
	validate_pmap_mutable(pmap);

	vm_map_address_t va = start;
	/* Accumulates whether any PTE change requires a TLB flush for this range. */
	pmap_tlb_flush_range_t flush_range = {
		.ptfr_pmap = pmap,
		.ptfr_start = start,
		.ptfr_end = end,
		.ptfr_flush_needed = false
	};

	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return va;
	}

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Process one twig (leaf table) at a time so each step can yield. */
	while (va < end) {
		vm_map_address_t curr_end;

		curr_end = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));
		if (curr_end > end) {
			curr_end = end;
		}

		va = phys_attribute_clear_twig_internal(pmap, va, curr_end, bits, options, &flush_range);
		if ((va < curr_end) || pmap_pending_preemption()) {
			break;
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	if (flush_range.ptfr_flush_needed) {
		pmap_get_pt_ops(pmap)->flush_tlb_region_async(
			flush_range.ptfr_start,
			flush_range.ptfr_end - flush_range.ptfr_start,
			flush_range.ptfr_pmap,
			true,
			false);
		sync_tlb_flush();
	}
	return va;
}
7304
/*
 * Clear attributes over a VA range, looping until the (preemptible)
 * helper reports full completion.
 */
static void
phys_attribute_clear_range(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	unsigned int bits,
	unsigned int options)
{
	/*
	 * We allow single-page requests to execute non-preemptibly,
	 * as it doesn't make sense to sample AST_URGENT for a single-page
	 * operation, and there are a couple of special use cases that
	 * require a non-preemptible single-page operation.
	 */
	if ((end - start) > (pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO)) {
		pmap_verify_preemptible();
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_START, bits);

	/* The helper returns the resume point; keep calling until the range is done. */
	while (start < end) {
#if XNU_MONITOR
		start = phys_attribute_clear_range_ppl(pmap, start, end, bits, options);
#else
		start = phys_attribute_clear_range_internal(pmap, start, end, bits, options);
#endif
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR_RANGE) | DBG_FUNC_END);
}
7335 #endif /* __ARM_RANGE_TLBI__ */
7336
/* Single-page attribute clear entry point; dispatches to the PPL or in-kernel helper. */
static void
phys_attribute_clear(
	ppnum_t pn,
	unsigned int bits,
	int options,
	void *arg)
{
	/*
	 * Do we really want this tracepoint? It will be extremely chatty.
	 * Also, should we have a corresponding trace point for the set path?
	 */
	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_START, pn, bits);

#if XNU_MONITOR
	phys_attribute_clear_ppl(pn, bits, options, arg);
#else
	phys_attribute_clear_internal(pn, bits, options, arg);
#endif

	PMAP_TRACE(3, PMAP_CODE(PMAP__ATTRIBUTE_CLEAR) | DBG_FUNC_END);
}
7358
7359 /*
7360 * Set specified attribute bits.
7361 *
7362 * Set cached value in the pv head because we have
7363 * no per-mapping hardware support for referenced and
7364 * modify bits.
7365 */
7366 MARK_AS_PMAP_TEXT void
7367 phys_attribute_set_internal(
7368 ppnum_t pn,
7369 unsigned int bits)
7370 {
7371 pmap_paddr_t pa = ptoa(pn);
7372 assert(pn != vm_page_fictitious_addr);
7373
7374 #if XNU_MONITOR
7375 if (bits & PP_ATTR_PPL_OWNED_BITS) {
7376 panic("%s: illegal request, "
7377 "pn=%u, bits=%#x",
7378 __FUNCTION__,
7379 pn, bits);
7380 }
7381 #endif
7382
7383 ppattr_pa_set_bits(pa, (uint16_t)bits);
7384
7385 return;
7386 }
7387
/* Dispatch an attribute-set request to the PPL or the in-kernel helper. */
static void
phys_attribute_set(
	ppnum_t pn,
	unsigned int bits)
{
#if XNU_MONITOR
	phys_attribute_set_ppl(pn, bits);
#else
	phys_attribute_set_internal(pn, bits);
#endif
}
7399
7400
7401 /*
7402 * Check specified attribute bits.
7403 *
7404 * use the software cached bits (since no hw support).
7405 */
7406 static boolean_t
7407 phys_attribute_test(
7408 ppnum_t pn,
7409 unsigned int bits)
7410 {
7411 pmap_paddr_t pa = ptoa(pn);
7412 assert(pn != vm_page_fictitious_addr);
7413 return ppattr_pa_test_bits(pa, (pp_attr_t)bits);
7414 }
7415
7416
7417 /*
7418 * Set the modify/reference bits on the specified physical page.
7419 */
/* Mark the page's software 'modified' attribute. */
void
pmap_set_modify(ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_MODIFIED);
}
7425
7426
7427 /*
7428 * Clear the modify bits on the specified physical page.
7429 */
/* Clear the page's software 'modified' attribute (forces re-faulting of writes). */
void
pmap_clear_modify(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_MODIFIED, 0, NULL);
}
7436
7437
7438 /*
7439 * pmap_is_modified:
7440 *
7441 * Return whether or not the specified physical page is modified
7442 * by any physical maps.
7443 */
/* Query the page's software 'modified' attribute. */
boolean_t
pmap_is_modified(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_MODIFIED);
}
7450
7451
7452 /*
7453 * Set the reference bit on the specified physical page.
7454 */
/* Mark the page's software 'referenced' attribute. */
static void
pmap_set_reference(
	ppnum_t pn)
{
	phys_attribute_set(pn, PP_ATTR_REFERENCED);
}
7461
7462 /*
7463 * Clear the reference bits on the specified physical page.
7464 */
/* Clear the page's software 'referenced' attribute (forces re-faulting of accesses). */
void
pmap_clear_reference(
	ppnum_t pn)
{
	phys_attribute_clear(pn, PP_ATTR_REFERENCED, 0, NULL);
}
7471
7472
7473 /*
7474 * pmap_is_referenced:
7475 *
7476 * Return whether or not the specified physical page is referenced
7477 * by any physical maps.
7478 */
/* Query the page's software 'referenced' attribute. */
boolean_t
pmap_is_referenced(
	ppnum_t pn)
{
	return phys_attribute_test(pn, PP_ATTR_REFERENCED);
}
7485
7486 /*
7487 * pmap_get_refmod(phys)
7488 * returns the referenced and modified bits of the specified
7489 * physical page.
7490 */
7491 unsigned int
7492 pmap_get_refmod(
7493 ppnum_t pn)
7494 {
7495 return ((phys_attribute_test(pn, PP_ATTR_MODIFIED)) ? VM_MEM_MODIFIED : 0)
7496 | ((phys_attribute_test(pn, PP_ATTR_REFERENCED)) ? VM_MEM_REFERENCED : 0);
7497 }
7498
7499 static inline unsigned int
7500 pmap_clear_refmod_mask_to_modified_bits(const unsigned int mask)
7501 {
7502 return ((mask & VM_MEM_MODIFIED) ? PP_ATTR_MODIFIED : 0) |
7503 ((mask & VM_MEM_REFERENCED) ? PP_ATTR_REFERENCED : 0);
7504 }
7505
7506 /*
7507 * pmap_clear_refmod(phys, mask)
7508 * clears the referenced and modified bits as specified by the mask
7509 * of the specified physical page.
7510 */
7511 void
7512 pmap_clear_refmod_options(
7513 ppnum_t pn,
7514 unsigned int mask,
7515 unsigned int options,
7516 void *arg)
7517 {
7518 unsigned int bits;
7519
7520 bits = pmap_clear_refmod_mask_to_modified_bits(mask);
7521 phys_attribute_clear(pn, bits, options, arg);
7522 }
7523
7524 /*
7525 * Perform pmap_clear_refmod_options on a virtual address range.
7526 * The operation will be performed in bulk & tlb flushes will be coalesced
7527 * if possible.
7528 *
7529 * Returns true if the operation is supported on this platform.
7530 * If this function returns false, the operation is not supported and
7531 * nothing has been modified in the pmap.
7532 */
/*
 * Bulk refmod clear over a VA range; supported only on hardware with
 * ranged TLBI.  Returns false (and does nothing) otherwise, in which case
 * the VM falls back to per-page operations.
 */
bool
pmap_clear_refmod_range_options(
	pmap_t pmap __unused,
	vm_map_address_t start __unused,
	vm_map_address_t end __unused,
	unsigned int mask __unused,
	unsigned int options __unused)
{
#if __ARM_RANGE_TLBI__
	unsigned int bits;
	bits = pmap_clear_refmod_mask_to_modified_bits(mask);
	phys_attribute_clear_range(pmap, start, end, bits, options);
	return true;
#else /* __ARM_RANGE_TLBI__ */
#pragma unused(pmap, start, end, mask, options)
	/*
	 * This operation allows the VM to bulk modify refmod bits on a virtually
	 * contiguous range of addresses. This is large performance improvement on
	 * platforms that support ranged tlbi instructions. But on older platforms,
	 * we can only flush per-page or the entire asid. So we currently
	 * only support this operation on platforms that support ranged tlbi.
	 * instructions. On other platforms, we require that
	 * the VM modify the bits on a per-page basis.
	 */
	return false;
#endif /* __ARM_RANGE_TLBI__ */
}
7560
/* Convenience wrapper: clear refmod bits with no options and no flush argument. */
void
pmap_clear_refmod(
	ppnum_t pn,
	unsigned int mask)
{
	pmap_clear_refmod_options(pn, mask, 0, NULL);
}
7568
7569 unsigned int
7570 pmap_disconnect_options(
7571 ppnum_t pn,
7572 unsigned int options,
7573 void *arg)
7574 {
7575 if ((options & PMAP_OPTIONS_COMPRESSOR_IFF_MODIFIED)) {
7576 /*
7577 * On ARM, the "modified" bit is managed by software, so
7578 * we know up-front if the physical page is "modified",
7579 * without having to scan all the PTEs pointing to it.
7580 * The caller should have made the VM page "busy" so noone
7581 * should be able to establish any new mapping and "modify"
7582 * the page behind us.
7583 */
7584 if (pmap_is_modified(pn)) {
7585 /*
7586 * The page has been modified and will be sent to
7587 * the VM compressor.
7588 */
7589 options |= PMAP_OPTIONS_COMPRESSOR;
7590 } else {
7591 /*
7592 * The page hasn't been modified and will be freed
7593 * instead of compressed.
7594 */
7595 }
7596 }
7597
7598 /* disconnect the page */
7599 pmap_page_protect_options(pn, 0, options, arg);
7600
7601 /* return ref/chg status */
7602 return pmap_get_refmod(pn);
7603 }
7604
7605 /*
7606 * Routine:
7607 * pmap_disconnect
7608 *
7609 * Function:
7610 * Disconnect all mappings for this page and return reference and change status
7611 * in generic format.
7612 *
7613 */
unsigned int
pmap_disconnect(
	ppnum_t pn)
{
	pmap_page_protect(pn, 0);       /* disconnect the page */
	return pmap_get_refmod(pn);     /* return ref/chg status */
}
7621
7622 boolean_t
7623 pmap_has_managed_page(ppnum_t first, ppnum_t last)
7624 {
7625 if (ptoa(first) >= vm_last_phys) {
7626 return FALSE;
7627 }
7628 if (ptoa(last) < vm_first_phys) {
7629 return FALSE;
7630 }
7631
7632 return TRUE;
7633 }
7634
7635 /*
7636 * The state maintained by the noencrypt functions is used as a
7637 * debugging aid on ARM. This incurs some overhead on the part
7638 * of the caller. A special case check in phys_attribute_clear
7639 * (the most expensive path) currently minimizes this overhead,
7640 * but stubbing these functions out on RELEASE kernels yields
7641 * further wins.
7642 */
7643 boolean_t
7644 pmap_is_noencrypt(
7645 ppnum_t pn)
7646 {
7647 #if DEVELOPMENT || DEBUG
7648 boolean_t result = FALSE;
7649
7650 if (!pa_valid(ptoa(pn))) {
7651 return FALSE;
7652 }
7653
7654 result = (phys_attribute_test(pn, PP_ATTR_NOENCRYPT));
7655
7656 return result;
7657 #else
7658 #pragma unused(pn)
7659 return FALSE;
7660 #endif
7661 }
7662
7663 void
7664 pmap_set_noencrypt(
7665 ppnum_t pn)
7666 {
7667 #if DEVELOPMENT || DEBUG
7668 if (!pa_valid(ptoa(pn))) {
7669 return;
7670 }
7671
7672 phys_attribute_set(pn, PP_ATTR_NOENCRYPT);
7673 #else
7674 #pragma unused(pn)
7675 #endif
7676 }
7677
7678 void
7679 pmap_clear_noencrypt(
7680 ppnum_t pn)
7681 {
7682 #if DEVELOPMENT || DEBUG
7683 if (!pa_valid(ptoa(pn))) {
7684 return;
7685 }
7686
7687 phys_attribute_clear(pn, PP_ATTR_NOENCRYPT, 0, NULL);
7688 #else
7689 #pragma unused(pn)
7690 #endif
7691 }
7692
#if XNU_MONITOR
/*
 * Return whether the given managed page is owned by the PPL monitor
 * (PP_ATTR_MONITOR set).  The page must be managed; callers are expected
 * to have established pa_valid() already (enforced by the assert).
 */
boolean_t
pmap_is_monitor(ppnum_t pn)
{
	assert(pa_valid(ptoa(pn)));
	return phys_attribute_test(pn, PP_ATTR_MONITOR);
}
#endif
7701
/*
 * Acquire a lock covering the given physical page: the per-page PV head
 * lock for managed pages, or the global phys_backup_lock for unmanaged
 * pages.  On XNU_MONITOR configurations the PV locks live inside the PPL,
 * so the backup lock is always used here.
 *
 * Pair with pmap_unlock_phys_page() on the same page.
 */
void
pmap_lock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_lock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback: unmanaged page (or XNU_MONITOR build) takes the global lock. */
	{ simple_lock(&phys_backup_lock, LCK_GRP_NULL);}
}
7718
7719
/*
 * Release the lock taken by pmap_lock_phys_page(): the per-page PV head
 * lock for managed pages, or the global phys_backup_lock otherwise
 * (always the latter on XNU_MONITOR configurations).
 */
void
pmap_unlock_phys_page(ppnum_t pn)
{
#if !XNU_MONITOR
	unsigned int pai;
	pmap_paddr_t phys = ptoa(pn);

	if (pa_valid(phys)) {
		pai = pa_index(phys);
		pvh_unlock(pai);
	} else
#else
	(void)pn;
#endif
	/* Fallback: unmanaged page (or XNU_MONITOR build) drops the global lock. */
	{ simple_unlock(&phys_backup_lock);}
}
7736
/*
 * Program the current CPU's user translation state for a switch to [pmap].
 * For a user pmap this caches the nested-pmap information in the per-CPU
 * data, reprograms TCR when the target pmap uses a different translation
 * configuration (mixed page-size configs only), and finally loads TTBR0
 * with the pmap's table root and hardware ASID.  For the kernel pmap the
 * user TTB is pointed at the invalid table instead.
 */
MARK_AS_PMAP_TEXT static void
pmap_switch_user_ttb(pmap_t pmap, pmap_cpu_data_t *cpu_data_ptr)
{
	if (pmap != kernel_pmap) {
		cpu_data_ptr->cpu_nested_pmap = pmap->nested_pmap;
		cpu_data_ptr->cpu_nested_pmap_attr = (cpu_data_ptr->cpu_nested_pmap == NULL) ?
		    NULL : pmap_get_pt_attr(cpu_data_ptr->cpu_nested_pmap);
		cpu_data_ptr->cpu_nested_region_addr = pmap->nested_region_addr;
		cpu_data_ptr->cpu_nested_region_size = pmap->nested_region_size;
#if __ARM_MIXED_PAGE_SIZE__
		cpu_data_ptr->commpage_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
#endif
	}


#if __ARM_MIXED_PAGE_SIZE__
	/* Only rewrite TCR when the target pmap's value actually differs. */
	if ((pmap != kernel_pmap) && (pmap_get_pt_attr(pmap)->pta_tcr_value != get_tcr())) {
		set_tcr(pmap_get_pt_attr(pmap)->pta_tcr_value);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */


	if (pmap != kernel_pmap) {
		set_mmu_ttb((pmap->ttep & TTBR_BADDR_MASK) | (((uint64_t)pmap->hw_asid) << TTBR_ASID_SHIFT));
	} else if (!pmap_user_ttb_is_clear()) {
		pmap_clear_user_ttb_internal();
	}
}
7765
/*
 * Point the user translation table base (TTBR0) at the invalid table,
 * detaching any user address space from this CPU.
 */
MARK_AS_PMAP_TEXT void
pmap_clear_user_ttb_internal(void)
{
	set_mmu_ttb(invalid_ttep & TTBR_BADDR_MASK);
}
7771
/*
 * Public entry point for clearing the user TTB on this CPU.  Dispatches
 * to the PPL on XNU_MONITOR configurations; emits trace events around the
 * operation.
 */
void
pmap_clear_user_ttb(void)
{
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_START, NULL, 0, 0);
#if XNU_MONITOR
	pmap_clear_user_ttb_ppl();
#else
	pmap_clear_user_ttb_internal();
#endif
	PMAP_TRACE(3, PMAP_CODE(PMAP__CLEAR_USER_TTB) | DBG_FUNC_END);
}
7783
7784
#if defined(__arm64__)
/*
 * Marker for use in multi-pass fast-fault PV list processing.
 * ARM_PTE_COMPRESSED should never otherwise be set on PTEs processed by
 * these functions, as compressed PTEs should never be present in PV lists.
 * Note that this only holds true for arm64; for arm32 we don't have enough
 * SW bits in the PTE, so the same bit does double-duty as the COMPRESSED
 * and WRITEABLE marker depending on whether the PTE is valid.
 */
#define ARM_PTE_FF_MARKER ARM_PTE_COMPRESSED
/* The marker must not collide with other SW-managed PTE bits it coexists with. */
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WRITEABLE, "compressed bit aliases writeable");
_Static_assert(ARM_PTE_COMPRESSED != ARM_PTE_WIRED, "compressed bit aliases wired");
#endif
7798
7799
/*
 * Routine:	arm_force_fast_fault_with_flush_range
 *
 * Function:
 *	Force all mappings of physical page [ppnum] to fault on the access
 *	modes NOT granted by [allow_mode], so that software ref/mod tracking
 *	can observe the next access.  Walks the page's PV list in two passes:
 *	pass 1 updates PTE permissions and tags entries needing TLB
 *	invalidation with ARM_PTE_FF_MARKER; pass 2 clears the markers and
 *	issues the invalidations (or defers them to the caller through
 *	[flush_range]).  Also maintains the reusable/internal ledgers when
 *	PMAP_OPTIONS_SET/CLEAR_REUSABLE is passed.
 *
 *	Returns TRUE if all mappings were updated; FALSE if the page is not
 *	managed or a wired mapping was skipped.  The PVH lock is taken here
 *	unless the caller passed PMAP_OPTIONS_FF_LOCKED.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_force_fast_fault_with_flush_range(
	ppnum_t ppnum,
	vm_prot_t allow_mode,
	int options,
	pmap_tlb_flush_range_t *flush_range)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	unsigned int pai;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;
	boolean_t result;
	pv_entry_t **pv_h;
	bool is_reusable;
	bool ref_fault;
	bool mod_fault;
	bool clear_write_fault = false;
	bool ref_aliases_mod = false;
	bool mustsynch = ((options & PMAP_OPTIONS_FF_LOCKED) == 0);

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(phys)) {
		return FALSE;   /* Not a managed page. */
	}

	result = TRUE;
	ref_fault = false;
	mod_fault = false;
	pai = pa_index(phys);
	if (__probable(mustsynch)) {
		pvh_lock(pai);
	}
	pv_h = pai_to_pvh(pai);

#if XNU_MONITOR
	if (__improbable(ppattr_pa_test_monitor(phys))) {
		panic("%s: PA 0x%llx belongs to PPL.", __func__, (uint64_t)phys);
	}
#endif
	/* The PV head is either a single PTE pointer, a PVE list, or empty. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
	} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
		panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)phys);
	}

	is_reusable = ppattr_test_reusable(pai);

	/*
	 * issue_tlbi is used to indicate that this function will need to issue at least one TLB
	 * invalidation during pass 2. tlb_flush_needed only indicates that PTE permissions have
	 * changed and that a TLB flush will be needed *at some point*, so we'll need to call
	 * FLUSH_PTE_STRONG() to synchronize prior PTE updates. In the case of a flush_range
	 * operation, TLB invalidation may be handled by the caller so it's possible for
	 * tlb_flush_needed to be true while issue_tlbi is false.
	 */
	bool issue_tlbi = false;
	bool tlb_flush_needed = false;

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}
		if (ARM_PTE_IS_COMPRESSED(*pte_p, pte_p)) {
			panic("pte is COMPRESSED: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);
		const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

		assert(va >= pmap->min && va < pmap->max);

		/* update pmap stats and ledgers */
		const bool is_internal = ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx);
		const bool is_altacct = ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx);
		if (is_altacct) {
			/*
			 * We do not track "reusable" status for
			 * "alternate accounting" mappings.
			 */
		} else if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) &&
		    is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one less "reusable" */
			pmap_ledger_debit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			/* one more "internal" */
			pmap_ledger_credit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_credit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);

			/*
			 * Since the page is being marked non-reusable, we assume that it will be
			 * modified soon.  Avoid the cost of another trap to handle the fast
			 * fault when we next write to this page.
			 */
			clear_write_fault = true;
		} else if ((options & PMAP_OPTIONS_SET_REUSABLE) &&
		    !is_reusable &&
		    is_internal &&
		    pmap != kernel_pmap) {
			/* one more "reusable" */
			pmap_ledger_credit(pmap, task_ledgers.reusable, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.internal, pt_attr_page_size(pt_attr) * PAGE_RATIO);
			pmap_ledger_debit(pmap, task_ledgers.phys_footprint, pt_attr_page_size(pt_attr) * PAGE_RATIO);
		}

		/* Wired mappings are left untouched unless the caller opted in. */
		bool wiredskip = pte_is_wired(*pte_p) &&
		    ((options & PMAP_OPTIONS_FF_WIRED) == 0);

		if (wiredskip) {
			result = FALSE;
			goto fff_skip_pve_pass1;
		}

		spte = *pte_p;
		tmplate = spte;

#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pt_attr, spte));
#endif /* HAS_FEAT_XS */
		if ((allow_mode & VM_PROT_READ) != VM_PROT_READ) {
			/* read protection sets the pte to fault */
			tmplate = tmplate & ~ARM_PTE_AF;
			ref_fault = true;
		}
		if ((allow_mode & VM_PROT_WRITE) != VM_PROT_WRITE) {
			/* take away write permission if set */
			if (pmap == kernel_pmap) {
				if ((tmplate & ARM_PTE_APMASK) == ARM_PTE_AP(AP_RWNA)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RONA));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			} else {
				if ((tmplate & ARM_PTE_APMASK) == pt_attr_leaf_rw(pt_attr)) {
					tmplate = ((tmplate & ~ARM_PTE_APMASK) | pt_attr_leaf_ro(pt_attr));
					pte_set_was_writeable(tmplate, true);
					mod_fault = true;
				}
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, options=0x%x, allow_mode=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, options, allow_mode);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		if (result && (tmplate != spte)) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE)) &&
			    !(options & PMAP_OPTIONS_NOFLUSH)) {
				tlb_flush_needed = true;
				/*
				 * Only mark for a pass-2 TLBI if the mapping is outside the
				 * caller-provided flush range (otherwise the caller flushes it).
				 */
				if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
				    va >= flush_range->ptfr_end || va < flush_range->ptfr_start) {
#ifdef ARM_PTE_FF_MARKER
					assert(!(spte & ARM_PTE_FF_MARKER));
					tmplate |= ARM_PTE_FF_MARKER;
					++pass1_updated;
#endif
					issue_tlbi = true;
				}
			}
			write_pte_fast(pte_p, tmplate);
		}

fff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

	if (tlb_flush_needed) {
		FLUSH_PTE_STRONG();
	}

	if (!issue_tlbi) {
		goto fff_finish;
	}

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto fff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto fff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto fff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		if (!flush_range || (flush_range->ptfr_pmap != pmap) ||
		    (va >= flush_range->ptfr_end) || (va < flush_range->ptfr_start)) {
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va,
			    pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO, pmap, true, false);
		}

fff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}

fff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}

	/*
	 * If we are using the same approach for ref and mod
	 * faults on this PTE, do not clear the write fault;
	 * this would cause both ref and mod to be set on the
	 * page again, and prevent us from taking ANY read/write
	 * fault on the mapping.
	 */
	if (clear_write_fault && !ref_aliases_mod) {
		arm_clear_fast_fault(ppnum, VM_PROT_WRITE, PT_ENTRY_NULL);
	}
	if (tlb_flush_needed) {
		if (flush_range) {
			/* Delayed flush. Signal to the caller that the flush is needed. */
			flush_range->ptfr_flush_needed = true;
		} else {
			sync_tlb_flush();
		}
	}

	/* update global "reusable" status for this page */
	if ((options & PMAP_OPTIONS_CLEAR_REUSABLE) && is_reusable) {
		ppattr_clear_reusable(pai);
	} else if ((options & PMAP_OPTIONS_SET_REUSABLE) && !is_reusable) {
		ppattr_set_reusable(pai);
	}

	if (mod_fault) {
		ppattr_set_modfault(pai);
	}
	if (ref_fault) {
		ppattr_set_reffault(pai);
	}
	if (__probable(mustsynch)) {
		pvh_unlock(pai);
	}
	return result;
}
8113
8114 MARK_AS_PMAP_TEXT boolean_t
8115 arm_force_fast_fault_internal(
8116 ppnum_t ppnum,
8117 vm_prot_t allow_mode,
8118 int options)
8119 {
8120 if (__improbable((options & (PMAP_OPTIONS_FF_LOCKED | PMAP_OPTIONS_FF_WIRED | PMAP_OPTIONS_NOFLUSH)) != 0)) {
8121 panic("arm_force_fast_fault(0x%x, 0x%x, 0x%x): invalid options", ppnum, allow_mode, options);
8122 }
8123 return arm_force_fast_fault_with_flush_range(ppnum, allow_mode, options, NULL);
8124 }
8125
8126 /*
8127 * Routine: arm_force_fast_fault
8128 *
8129 * Function:
8130 * Force all mappings for this page to fault according
8131 * to the access modes allowed, so we can gather ref/modify
8132 * bits again.
8133 */
8134
8135 boolean_t
8136 arm_force_fast_fault(
8137 ppnum_t ppnum,
8138 vm_prot_t allow_mode,
8139 int options,
8140 __unused void *arg)
8141 {
8142 pmap_paddr_t phys = ptoa(ppnum);
8143
8144 assert(ppnum != vm_page_fictitious_addr);
8145
8146 if (!pa_valid(phys)) {
8147 return FALSE; /* Not a managed page. */
8148 }
8149
8150 #if XNU_MONITOR
8151 return arm_force_fast_fault_ppl(ppnum, allow_mode, options);
8152 #else
8153 return arm_force_fast_fault_internal(ppnum, allow_mode, options);
8154 #endif
8155 }
8156
8157 /*
8158 * Routine: arm_clear_fast_fault
8159 *
8160 * Function:
8161 * Clear pending force fault for all mappings for this page based on
8162 * the observed fault type, update ref/modify bits.
8163 */
/*
 * If [pte_p] is PT_ENTRY_NULL, every mapping on the page's PV list is
 * processed (bounded by PMAP_MAX_PV_LIST_CHUNK_SIZE per call); otherwise
 * only the supplied PTE is fixed up.  Uses the same two-pass
 * mark-then-invalidate scheme as arm_force_fast_fault_with_flush_range().
 * The caller must hold the PVH lock for the page.
 *
 * Returns TRUE if at least one PTE was updated.
 */
MARK_AS_PMAP_TEXT static boolean_t
arm_clear_fast_fault(
	ppnum_t ppnum,
	vm_prot_t fault_type,
	pt_entry_t *pte_p)
{
	pmap_paddr_t pa = ptoa(ppnum);
	pv_entry_t *pve_p;
	unsigned int pai;
	boolean_t result;
	bool tlb_flush_needed = false;
	pv_entry_t **pv_h;
	unsigned int npve = 0;
	unsigned int pass1_updated = 0;
	unsigned int pass2_updated = 0;

	assert(ppnum != vm_page_fictitious_addr);

	if (!pa_valid(pa)) {
		return FALSE;   /* Not a managed page. */
	}

	result = FALSE;
	pai = pa_index(pa);
	pvh_assert_locked(pai);
	pv_h = pai_to_pvh(pai);

	/* If the caller did not pin a specific PTE, walk the whole PV list. */
	pve_p = PV_ENTRY_NULL;
	if (pte_p == PT_ENTRY_NULL) {
		if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
			pte_p = pvh_ptep(pv_h);
		} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
		} else if (__improbable(!pvh_test_type(pv_h, PVH_TYPE_NULL))) {
			panic("%s: invalid PV head 0x%llx for PA 0x%llx", __func__, (uint64_t)(*pv_h), (uint64_t)pa);
		}
	}

	pv_entry_t *orig_pve_p = pve_p;
	pt_entry_t *orig_pte_p = pte_p;
	int pve_ptep_idx = 0;

	/*
	 * Pass 1: Make any necessary PTE updates, marking PTEs that will require
	 * TLB invalidation in pass 2.
	 */
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		pt_entry_t spte;
		pt_entry_t tmplate;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass1;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass1;
		}
#endif
		if (*pte_p == ARM_PTE_EMPTY) {
			panic("pte is empty: pte_p=%p ppnum=0x%x", pte_p, ppnum);
		}

		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		__assert_only const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		assert(va >= pmap->min && va < pmap->max);

		spte = *pte_p;
		tmplate = spte;

		if ((fault_type & VM_PROT_WRITE) && (pte_was_writeable(spte))) {
			/* Restore write permission and record the page as referenced+modified. */
			{
				if (pmap == kernel_pmap) {
					tmplate = ((spte & ~ARM_PTE_APMASK) | ARM_PTE_AP(AP_RWNA));
				} else {
					assert(pmap->type != PMAP_TYPE_NESTED);
					tmplate = ((spte & ~ARM_PTE_APMASK) | pt_attr_leaf_rw(pmap_get_pt_attr(pmap)));
				}
			}

			tmplate |= ARM_PTE_AF;

			pte_set_was_writeable(tmplate, false);
			ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED | PP_ATTR_MODIFIED);
		} else if ((fault_type & VM_PROT_READ) && ((spte & ARM_PTE_AF) != ARM_PTE_AF)) {
			/* Set the access flag and record the page as referenced. */
			tmplate = spte | ARM_PTE_AF;

			{
				ppattr_pa_set_bits(pa, PP_ATTR_REFERENCED);
			}
		}

#if MACH_ASSERT && XNU_MONITOR
		if (is_pte_xprr_protected(pmap, spte)) {
			if (pte_to_xprr_perm(spte) != pte_to_xprr_perm(tmplate)) {
				panic("%s: attempted to mutate an xPRR mapping pte_p=%p, pmap=%p, pv_h=%p, pve_p=%p, pte=0x%llx, tmplate=0x%llx, va=0x%llx, "
				    "ppnum=0x%x, fault_type=0x%x",
				    __FUNCTION__, pte_p, pmap, pv_h, pve_p, (unsigned long long)spte, (unsigned long long)tmplate, (unsigned long long)va,
				    ppnum, fault_type);
			}
		}
#endif /* MACH_ASSERT && XNU_MONITOR */

		assert(spte != ARM_PTE_TYPE_FAULT);
		if (spte != tmplate) {
			if ((spte & (~ARM_PTE_WRITEABLE)) != (tmplate & (~ARM_PTE_WRITEABLE))) {
#ifdef ARM_PTE_FF_MARKER
				assert(!(spte & ARM_PTE_FF_MARKER));
				tmplate |= ARM_PTE_FF_MARKER;
				++pass1_updated;
#endif
				tlb_flush_needed = true;
			}
			write_pte_fast(pte_p, tmplate);
			result = TRUE;
		}

cff_skip_pve_pass1:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			/* Bound work done with the PVH lock held; the caller may redrive. */
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

	if (!tlb_flush_needed) {
		goto cff_finish;
	}

	FLUSH_PTE_STRONG();

	/* Pass 2: Issue any required TLB invalidations */
	pve_p = orig_pve_p;
	pte_p = orig_pte_p;
	pve_ptep_idx = 0;
	npve = 0;

	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				goto cff_skip_pve_pass2;
			}
		}

#ifdef PVH_FLAG_IOMMU
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cff_skip_pve_pass2;
		}
#endif

#ifdef ARM_PTE_FF_MARKER
		pt_entry_t spte = *pte_p;

		if (!(spte & ARM_PTE_FF_MARKER)) {
			goto cff_skip_pve_pass2;
		} else {
			spte &= (~ARM_PTE_FF_MARKER);
			/* No need to synchronize with the TLB flush; we're changing a SW-managed bit */
			write_pte_fast(pte_p, spte);
			++pass2_updated;
		}
#endif
		const pt_desc_t * const ptdp = ptep_get_ptd(pte_p);
		const pmap_t pmap = ptdp->pmap;
		const vm_map_address_t va = ptd_get_va(ptdp, pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

cff_skip_pve_pass2:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
			++npve;
			if (__improbable(npve == PMAP_MAX_PV_LIST_CHUNK_SIZE)) {
				break;
			}
		}
	}

cff_finish:
	/* Both passes must have visited exactly the same set of marked PTEs. */
	if (__improbable(pass1_updated != pass2_updated)) {
		panic("%s: first pass (%u) and second pass (%u) disagree on updated mappings",
		    __func__, pass1_updated, pass2_updated);
	}
	if (tlb_flush_needed) {
		sync_tlb_flush();
	}
	return result;
}
8365
8366 /*
8367 * Determine if the fault was induced by software tracking of
8368 * modify/reference bits. If so, re-enable the mapping (and set
8369 * the appropriate bits).
8370 *
8371 * Returns KERN_SUCCESS if the fault was induced and was
8372 * successfully handled.
8373 *
8374 * Returns KERN_FAILURE if the fault was not induced and
8375 * the function was unable to deal with it.
8376 *
8377 * Returns KERN_PROTECTION_FAILURE if the pmap layer explictly
8378 * disallows this type of access.
8379 *
8380 * Returns KERN_ABORTED if the pmap lock is taken and a
8381 * preemption is pending.
8382 *
8383 */
/*
 * Internal (PPL-side on XNU_MONITOR) implementation of arm_fast_fault().
 * Locates the faulting PTE under the pmap shared lock, acquires the PVH
 * lock for the backing page with a re-check loop (the PTE may change
 * between the read and the lock acquisition), then clears the pending
 * software ref/mod fault state via arm_clear_fast_fault().
 */
MARK_AS_PMAP_TEXT kern_return_t
arm_fast_fault_internal(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	__unused bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;
	pt_entry_t *ptep;
	pt_entry_t spte = ARM_PTE_TYPE_FAULT;
	unsigned int pai;
	pmap_paddr_t pa;
	validate_pmap_mutable(pmap);

	/* Bail out (caller redrives) rather than hold off a pending preemption. */
	if (!pmap_lock_preempt(pmap, PMAP_LOCK_SHARED)) {
		return KERN_ABORTED;
	}

	/*
	 * If the entry doesn't exist, is completely invalid, or is already
	 * valid, we can't fix it here.
	 */

	const uint64_t pmap_page_size = pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO;
	ptep = pmap_pte(pmap, va & ~(pmap_page_size - 1));
	if (ptep != PT_ENTRY_NULL) {
		while (true) {
			spte = *((volatile pt_entry_t*)ptep);

			pa = pte_to_pa(spte);

			if ((spte == ARM_PTE_TYPE_FAULT) ||
			    ARM_PTE_IS_COMPRESSED(spte, ptep)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
				return result;
			}

			if (!pa_valid(pa)) {
				pmap_unlock(pmap, PMAP_LOCK_SHARED);
#if XNU_MONITOR
				if (pmap_cache_attributes((ppnum_t)atop(pa)) & PP_ATTR_MONITOR) {
					return KERN_PROTECTION_FAILURE;
				} else
#endif
				return result;
			}
			pai = pa_index(pa);
			pvh_lock(pai);
			if (*ptep == spte) {
				/*
				 * Double-check the spte value, as we care about the AF bit.
				 * It's also possible that pmap_page_protect() transitioned the
				 * PTE to compressed/empty before we grabbed the PVH lock.
				 */
				break;
			}
			/* PTE changed under us; drop the PVH lock and re-read. */
			pvh_unlock(pai);
		}
	} else {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return result;
	}


	if ((result != KERN_SUCCESS) &&
	    ((ppattr_test_reffault(pai)) || ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)))) {
		/*
		 * An attempted access will always clear ref/mod fault state, as
		 * appropriate for the fault type.  arm_clear_fast_fault will
		 * update the associated PTEs for the page as appropriate; if
		 * any PTEs are updated, we redrive the access.  If the mapping
		 * does not actually allow for the attempted access, the
		 * following fault will (hopefully) fail to update any PTEs, and
		 * thus cause arm_fast_fault to decide that it failed to handle
		 * the fault.
		 */
		if (ppattr_test_reffault(pai)) {
			ppattr_clear_reffault(pai);
		}
		if ((fault_type & VM_PROT_WRITE) && ppattr_test_modfault(pai)) {
			ppattr_clear_modfault(pai);
		}

		if (arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, PT_ENTRY_NULL)) {
			/*
			 * Should this preserve KERN_PROTECTION_FAILURE?  The
			 * cost of not doing so is a another fault in a case
			 * that should already result in an exception.
			 */
			result = KERN_SUCCESS;
		}
	}

	/*
	 * If the PTE already has sufficient permissions, we can report the fault as handled.
	 * This may happen, for example, if multiple threads trigger roughly simultaneous faults
	 * on mappings of the same page
	 */
	if ((result == KERN_FAILURE) && (spte & ARM_PTE_AF)) {
		uintptr_t ap_ro, ap_rw, ap_x;
		if (pmap == kernel_pmap) {
			ap_ro = ARM_PTE_AP(AP_RONA);
			ap_rw = ARM_PTE_AP(AP_RWNA);
			ap_x = ARM_PTE_NX;
		} else {
			ap_ro = pt_attr_leaf_ro(pmap_get_pt_attr(pmap));
			ap_rw = pt_attr_leaf_rw(pmap_get_pt_attr(pmap));
			ap_x = pt_attr_leaf_x(pmap_get_pt_attr(pmap));
		}
		/*
		 * NOTE: this doesn't currently handle user-XO mappings. Depending upon the
		 * hardware they may be xPRR-protected, in which case they'll be handled
		 * by the is_pte_xprr_protected() case above. Additionally, the exception
		 * handling path currently does not call arm_fast_fault() without at least
		 * VM_PROT_READ in fault_type.
		 */
		if (((spte & ARM_PTE_APMASK) == ap_rw) ||
		    (!(fault_type & VM_PROT_WRITE) && ((spte & ARM_PTE_APMASK) == ap_ro))) {
			if (!(fault_type & VM_PROT_EXECUTE) || ((spte & ARM_PTE_XMASK) == ap_x)) {
				result = KERN_SUCCESS;
			}
		}
	}

	if ((result == KERN_FAILURE) && arm_clear_fast_fault((ppnum_t)atop(pa), fault_type, ptep)) {
		/*
		 * A prior arm_clear_fast_fault() operation may have returned early due to
		 * another pending PV list operation or an excessively large PV list.
		 * Attempt a targeted fixup of the PTE that caused the fault to avoid repeatedly
		 * taking a fault on the same mapping.
		 */
		result = KERN_SUCCESS;
	}

	pvh_unlock(pai);
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	return result;
}
8523
/*
 * Routine:	arm_fast_fault
 *
 * Function:
 *	Top-level entry point for handling software ref/mod tracking faults.
 *	Validates that [va] falls within the pmap's range, then invokes the
 *	internal handler (through the PPL on XNU_MONITOR configurations),
 *	redriving it for as long as it reports KERN_ABORTED due to a pending
 *	preemption.
 */
kern_return_t
arm_fast_fault(
	pmap_t pmap,
	vm_map_address_t va,
	vm_prot_t fault_type,
	bool was_af_fault,
	__unused bool from_user)
{
	kern_return_t result = KERN_FAILURE;

	if (va < pmap->min || va >= pmap->max) {
		return result;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(va), fault_type,
	    from_user);

	do {
#if XNU_MONITOR
		result = arm_fast_fault_ppl(pmap, va, fault_type, was_af_fault, from_user);
#else
		result = arm_fast_fault_internal(pmap, va, fault_type, was_af_fault, from_user);
#endif
	} while (result == KERN_ABORTED);

	PMAP_TRACE(3, PMAP_CODE(PMAP__FAST_FAULT) | DBG_FUNC_END, result);

	return result;
}
8554
8555 void
8556 pmap_copy_page(
8557 ppnum_t psrc,
8558 ppnum_t pdst)
8559 {
8560 bcopy_phys((addr64_t) (ptoa(psrc)),
8561 (addr64_t) (ptoa(pdst)),
8562 PAGE_SIZE);
8563 }
8564
8565
8566 /*
8567 * pmap_copy_page copies the specified (machine independent) pages.
8568 */
8569 void
8570 pmap_copy_part_page(
8571 ppnum_t psrc,
8572 vm_offset_t src_offset,
8573 ppnum_t pdst,
8574 vm_offset_t dst_offset,
8575 vm_size_t len)
8576 {
8577 bcopy_phys((addr64_t) (ptoa(psrc) + src_offset),
8578 (addr64_t) (ptoa(pdst) + dst_offset),
8579 len);
8580 }
8581
8582
8583 /*
8584 * pmap_zero_page zeros the specified (machine independent) page.
8585 */
8586 void
8587 pmap_zero_page(
8588 ppnum_t pn)
8589 {
8590 assert(pn != vm_page_fictitious_addr);
8591 bzero_phys((addr64_t) ptoa(pn), PAGE_SIZE);
8592 }
8593
8594 /*
8595 * pmap_zero_part_page
8596 * zeros the specified (machine independent) part of a page.
8597 */
8598 void
8599 pmap_zero_part_page(
8600 ppnum_t pn,
8601 vm_offset_t offset,
8602 vm_size_t len)
8603 {
8604 assert(pn != vm_page_fictitious_addr);
8605 assert(offset + len <= PAGE_SIZE);
8606 bzero_phys((addr64_t) (ptoa(pn) + offset), len);
8607 }
8608
/*
 * Map the low-globals structure (lowGlo) at its fixed alias address
 * LOWGLOBAL_ALIAS in the kernel pmap.  The mapping is created read-only
 * (AP_RONA), non-executable at all ELs, writeback-cacheable, and
 * outer-shareable.  The asserts require that the alias PTE exists and is
 * not already in use.
 */
void
pmap_map_globals(
	void)
{
	pt_entry_t *ptep, pte;

	ptep = pmap_pte(kernel_pmap, LOWGLOBAL_ALIAS);
	assert(ptep != PT_ENTRY_NULL);
	assert(*ptep == ARM_PTE_EMPTY);

	pte = pa_to_pte(ml_static_vtop((vm_offset_t)&lowGlo)) | AP_RONA | ARM_PTE_NX | ARM_PTE_PNX | ARM_PTE_AF | ARM_PTE_TYPE;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */
	pte |= ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK);
	pte |= ARM_PTE_SH(SH_OUTER_MEMORY);
	*ptep = pte;
	FLUSH_PTE();
	PMAP_UPDATE_TLBS(kernel_pmap, LOWGLOBAL_ALIAS, LOWGLOBAL_ALIAS + PAGE_SIZE, false, true);

#if KASAN
	kasan_notify_address(LOWGLOBAL_ALIAS, PAGE_SIZE);
#endif
}
8633
/*
 * Return the virtual address of per-CPU copy window [index] for CPU
 * [cpu_num].  Windows are page-sized slots laid out contiguously from
 * CPUWINDOWS_BASE, CPUWINDOWS_MAX slots per CPU.  Panics on an
 * out-of-range index.
 */
vm_offset_t
pmap_cpu_windows_copy_addr(int cpu_num, unsigned int index)
{
	if (__improbable(index >= CPUWINDOWS_MAX)) {
		panic("%s: invalid index %u", __func__, index);
	}
	return (vm_offset_t)(CPUWINDOWS_BASE + (PAGE_SIZE * ((CPUWINDOWS_MAX * cpu_num) + index)));
}
8642
/*
 * Map physical page [pn] into a free per-CPU copy window with the
 * requested protection [prot] and cacheability [wimg_bits], returning the
 * window index for use with pmap_unmap_cpu_windows_copy().  Panics if all
 * CPUWINDOWS_MAX windows for this CPU are in use.  On XNU_MONITOR
 * configurations, managed pages and writable mappings of PPL-protected
 * I/O are rejected.
 */
MARK_AS_PMAP_TEXT unsigned int
pmap_map_cpu_windows_copy_internal(
	ppnum_t pn,
	vm_prot_t prot,
	unsigned int wimg_bits)
{
	pt_entry_t *ptep = NULL, pte;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();
	unsigned int cpu_num;
	unsigned int i;
	vm_offset_t cpu_copywindow_vaddr = 0;
	bool need_strong_sync = false;

#if XNU_MONITOR
	unsigned int cacheattr = (!pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) ? pmap_cache_attributes(pn) : 0);
	need_strong_sync = ((cacheattr & PMAP_IO_RANGE_STRONG_SYNC) != 0);
#endif

#if XNU_MONITOR
#ifdef __ARM_COHERENT_IO__
	if (__improbable(pa_valid(ptoa(pn) & ARM_PTE_PAGE_MASK) && !pmap_ppl_disable)) {
		panic("%s: attempted to map a managed page, "
		    "pn=%u, prot=0x%x, wimg_bits=0x%x",
		    __FUNCTION__,
		    pn, prot, wimg_bits);
	}
	if (__improbable((cacheattr & PP_ATTR_MONITOR) && (prot != VM_PROT_READ) && !pmap_ppl_disable)) {
		panic("%s: attempt to map PPL-protected I/O address 0x%llx as writable", __func__, (uint64_t)ptoa(pn));
	}

#else /* __ARM_COHERENT_IO__ */
#error CPU copy windows are not properly supported with both the PPL and incoherent IO
#endif /* __ARM_COHERENT_IO__ */
#endif /* XNU_MONITOR */
	cpu_num = pmap_cpu_data->cpu_number;

	/* Scan this CPU's window slots for one whose PTE is currently empty. */
	for (i = 0; i < CPUWINDOWS_MAX; i++) {
		cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, i);
		ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
		assert(!ARM_PTE_IS_COMPRESSED(*ptep, ptep));
		if (*ptep == ARM_PTE_TYPE_FAULT) {
			break;
		}
	}
	if (i == CPUWINDOWS_MAX) {
		panic("pmap_map_cpu_windows_copy: out of window");
	}

	pte = pa_to_pte(ptoa(pn)) | ARM_PTE_TYPE | ARM_PTE_AF | ARM_PTE_NX | ARM_PTE_PNX;
#if __ARM_KERNEL_PROTECT__
	pte |= ARM_PTE_NG;
#endif /* __ARM_KERNEL_PROTECT__ */

	pte |= wimg_to_pte(wimg_bits, ptoa(pn));

	if (prot & VM_PROT_WRITE) {
		pte |= ARM_PTE_AP(AP_RWNA);
	} else {
		pte |= ARM_PTE_AP(AP_RONA);
	}
#if HAS_FEAT_XS
	need_strong_sync = pte_is_xs(native_pt_attr, pte);
#endif
	write_pte_fast(ptep, pte);
	/*
	 * Invalidate tlb. Cover nested cpu_copywindow_vaddr usage with the interrupted context
	 * in pmap_unmap_cpu_windows_copy() after clearing the pte and before tlb invalidate.
	 */
	FLUSH_PTE_STRONG();
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[i], true);
	pmap_cpu_data->copywindow_strong_sync[i] = need_strong_sync;

	return i;
}
8717
8718 unsigned int
8719 pmap_map_cpu_windows_copy(
8720 ppnum_t pn,
8721 vm_prot_t prot,
8722 unsigned int wimg_bits)
8723 {
8724 #if XNU_MONITOR
8725 return pmap_map_cpu_windows_copy_ppl(pn, prot, wimg_bits);
8726 #else
8727 return pmap_map_cpu_windows_copy_internal(pn, prot, wimg_bits);
8728 #endif
8729 }
8730
/*
 * Tear down a per-CPU copy window mapping previously established by
 * pmap_map_cpu_windows_copy_internal() on the current CPU.
 *
 * @param index index of the copy window to unmap, as returned by the
 *              corresponding map call.
 */
MARK_AS_PMAP_TEXT void
pmap_unmap_cpu_windows_copy_internal(
	unsigned int index)
{
	pt_entry_t *ptep;
	unsigned int cpu_num;
	vm_offset_t cpu_copywindow_vaddr = 0;
	pmap_cpu_data_t *pmap_cpu_data = pmap_get_cpu_data();

	cpu_num = pmap_cpu_data->cpu_number;

	cpu_copywindow_vaddr = pmap_cpu_windows_copy_addr(cpu_num, index);
	/* Issue full-system DSB to ensure prior operations on the per-CPU window
	 * (which are likely to have been on I/O memory) are complete before
	 * tearing down the mapping. */
	__builtin_arm_dsb(DSB_SY);
	ptep = pmap_pte(kernel_pmap, cpu_copywindow_vaddr);
	/* Clear the PTE, then invalidate the TLB for the window.  The strong-sync
	 * flag recorded at map time is passed along to the TLB invalidation. */
	write_pte_strong(ptep, ARM_PTE_TYPE_FAULT);
	PMAP_UPDATE_TLBS(kernel_pmap, cpu_copywindow_vaddr, cpu_copywindow_vaddr + PAGE_SIZE, pmap_cpu_data->copywindow_strong_sync[index], true);
}
8751
/*
 * Unmap the per-CPU copy window identified by 'index', routing the
 * request through the PPL when the monitor is enabled.
 */
void
pmap_unmap_cpu_windows_copy(
	unsigned int index)
{
#if XNU_MONITOR
	pmap_unmap_cpu_windows_copy_ppl(index);
#else
	pmap_unmap_cpu_windows_copy_internal(index);
#endif
}
8762
8763 #if XNU_MONITOR
8764
8765 MARK_AS_PMAP_TEXT void
8766 pmap_invoke_with_page(
8767 ppnum_t page_number,
8768 void *ctx,
8769 void (*callback)(void *ctx, ppnum_t page_number, const void *page))
8770 {
8771 #pragma unused(page_number, ctx, callback)
8772 }
8773
8774 /*
8775 * Loop over every pmap_io_range (I/O ranges marked as owned by
8776 * the PPL in the device tree) and conditionally call callback() on each range
8777 * that needs to be included in the hibernation image.
8778 *
8779 * @param ctx Will be passed as-is into the callback method. Use NULL if no
8780 * context is needed in the callback.
8781 * @param callback Callback function invoked on each range (gated by flag).
8782 */
8783 MARK_AS_PMAP_TEXT void
8784 pmap_hibernate_invoke(void *ctx, void (*callback)(void *ctx, uint64_t addr, uint64_t len))
8785 {
8786 extern const pmap_io_range_t* io_attr_table;
8787 extern const unsigned int num_io_rgns;
8788 for (unsigned int i = 0; i < num_io_rgns; ++i) {
8789 if (io_attr_table[i].wimg & PMAP_IO_RANGE_NEEDS_HIBERNATING) {
8790 callback(ctx, io_attr_table[i].addr, io_attr_table[i].len);
8791 }
8792 }
8793 }
8794
8795 /**
8796 * Set the HASHED pv_head_table flag for the passed in physical page if it's a
8797 * PPL-owned page. Otherwise, do nothing.
8798 *
8799 * @param addr Physical address of the page to set the HASHED flag on.
8800 */
8801 MARK_AS_PMAP_TEXT void
8802 pmap_set_ppl_hashed_flag(const pmap_paddr_t addr)
8803 {
8804 /* Ignore non-managed kernel memory. */
8805 if (!pa_valid(addr)) {
8806 return;
8807 }
8808
8809 const unsigned int pai = pa_index(addr);
8810 if (pp_attr_table[pai] & PP_ATTR_MONITOR) {
8811 pv_entry_t **pv_h = pai_to_pvh(pai);
8812
8813 /* Mark that the PPL-owned page has been hashed into the hibernation image. */
8814 pvh_lock(pai);
8815 pvh_set_flags(pv_h, pvh_get_flags(pv_h) | PVH_FLAG_HASHED);
8816 pvh_unlock(pai);
8817 }
8818 }
8819
8820 /**
8821 * Loop through every physical page in the system and clear out the HASHED flag
8822 * on every PPL-owned page. That flag is used to keep track of which pages have
8823 * been hashed into the hibernation image during the hibernation entry process.
8824 *
8825 * The HASHED flag needs to be cleared out between hibernation cycles because the
8826 * pv_head_table and pp_attr_table's might have been copied into the hibernation
8827 * image with the HASHED flag set on certain pages. It's important to clear the
8828 * HASHED flag to ensure that the enforcement of all PPL-owned memory being hashed
8829 * into the hibernation image can't be compromised across hibernation cycles.
8830 */
8831 MARK_AS_PMAP_TEXT void
8832 pmap_clear_ppl_hashed_flag_all(void)
8833 {
8834 const unsigned int last_index = pa_index(vm_last_phys);
8835 pv_entry_t **pv_h = NULL;
8836
8837 for (int pai = 0; pai < last_index; ++pai) {
8838 pv_h = pai_to_pvh(pai);
8839
8840 /* Test for PPL-owned pages that have the HASHED flag set in its pv_head_table entry. */
8841 if ((pvh_get_flags(pv_h) & PVH_FLAG_HASHED) &&
8842 (pp_attr_table[pai] & PP_ATTR_MONITOR)) {
8843 pvh_lock(pai);
8844 pvh_set_flags(pv_h, pvh_get_flags(pv_h) & ~PVH_FLAG_HASHED);
8845 pvh_unlock(pai);
8846 }
8847 }
8848 }
8849
8850 /**
8851 * Enforce that all PPL-owned pages were hashed into the hibernation image. The
8852 * ppl_hib driver will call this after all wired pages have been copied into the
8853 * hibernation image.
8854 */
8855 MARK_AS_PMAP_TEXT void
8856 pmap_check_ppl_hashed_flag_all(void)
8857 {
8858 const unsigned int last_index = pa_index(vm_last_phys);
8859 pv_entry_t **pv_h = NULL;
8860
8861 for (int pai = 0; pai < last_index; ++pai) {
8862 pv_h = pai_to_pvh(pai);
8863
8864 /**
8865 * The PMAP stacks are explicitly not saved into the image so skip checking
8866 * the pages that contain the PMAP stacks.
8867 */
8868 const bool is_pmap_stack = (pai >= pa_index(pmap_stacks_start_pa)) &&
8869 (pai < pa_index(pmap_stacks_end_pa));
8870
8871 if (!is_pmap_stack &&
8872 (pp_attr_table[pai] & PP_ATTR_MONITOR) &&
8873 !(pvh_get_flags(pv_h) & PVH_FLAG_HASHED)) {
8874 panic("Found PPL-owned page that was not hashed into the hibernation image: pai %d", pai);
8875 }
8876 }
8877 }
8878
8879 #endif /* XNU_MONITOR */
8880
8881 /*
8882 * Indicate that a pmap is intended to be used as a nested pmap
8883 * within one or more larger address spaces. This must be set
8884 * before pmap_nest() is called with this pmap as the 'subordinate'.
8885 */
MARK_AS_PMAP_TEXT void
pmap_set_nested_internal(
	pmap_t pmap)
{
	validate_pmap_mutable(pmap);
	/* Atomically transition the pmap from USER to NESTED; any other
	 * starting type (including an already-nested pmap) is fatal. */
	if (__improbable(!(os_atomic_cmpxchg(&pmap->type, PMAP_TYPE_USER, PMAP_TYPE_NESTED, seq_cst)))) {
		panic("%s: attempt to nest unsupported pmap %p of type 0x%hhx",
		    __func__, pmap, pmap->type);
	}

#if XNU_MONITOR
	/**
	 * The "seq_cst" ordering of the atomic load here guarantees
	 * the check below is performed after the type update above
	 * is observed. Together with similar order guarantee at
	 * pmap_switch_internal(), it makes sure a pmap is never
	 * active-and-nested:
	 *
	 * pmap_set_nested()  | pmap_switch()
	 * --------------------------------------
	 * set nested         | set active
	 * store-load barrier | store-load barrier
	 * assert !active     | assert !nested
	 */
	const int max_cpu = ml_get_max_cpu_number();
	for (unsigned int i = 0; i <= max_cpu; ++i) {
		const pmap_cpu_data_t *cpu_data = pmap_get_remote_cpu_data(i);
		if (cpu_data == NULL) {
			continue;
		}
		if (__improbable(os_atomic_load(&cpu_data->active_pmap, seq_cst) == pmap)) {
			panic("pmap %p: attempting to set nested while active on cpu %llu", pmap, (uint64_t)i);
		}
	}
#endif /* XNU_MONITOR */

	/**
	 * Ensure that a (potentially concurrent) call to pmap_nest() hasn't tried to give
	 * this pmap its own nested pmap.
	 */
	if (__improbable(os_atomic_load(&pmap->nested_pmap, seq_cst) != NULL)) {
		panic("%s: attempt to nest pmap %p which already has a nested pmap", __func__, pmap);
	}

	/* Release this pmap's ID now that it is marked nested. */
	pmap_get_pt_ops(pmap)->free_id(pmap);
}
8932
8933 void
8934 pmap_set_nested(
8935 pmap_t pmap)
8936 {
8937 #if XNU_MONITOR
8938 pmap_set_nested_ppl(pmap);
8939 #else
8940 pmap_set_nested_internal(pmap);
8941 #endif
8942 }
8943
8944 bool
8945 pmap_is_nested(
8946 pmap_t pmap)
8947 {
8948 return pmap->type == PMAP_TYPE_NESTED;
8949 }
8950
8951 /*
8952 * pmap_trim_range(pmap, start, end)
8953 *
8954 * pmap = pmap to operate on
8955 * start = start of the range
8956 * end = end of the range
8957 *
8958 * Attempts to deallocate TTEs for the given range in the nested range.
8959 */
MARK_AS_PMAP_TEXT static void
pmap_trim_range(
	pmap_t pmap,
	addr64_t start,
	addr64_t end)
{
	addr64_t cur;
	addr64_t nested_region_start;
	addr64_t nested_region_end;
	addr64_t adjusted_start;
	addr64_t adjusted_end;
	addr64_t adjust_offmask;
	tt_entry_t * tte_p;
	pt_entry_t * pte_p;
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Reject inverted ranges outright. */
	if (__improbable(end < start)) {
		panic("%s: invalid address range, "
		    "pmap=%p, start=%p, end=%p",
		    __func__,
		    pmap, (void*)start, (void*)end);
	}

	nested_region_start = pmap->nested_region_addr;
	nested_region_end = nested_region_start + pmap->nested_region_size;

	/* The requested range must lie entirely within the nested region. */
	if (__improbable((start < nested_region_start) || (end > nested_region_end))) {
		panic("%s: range outside nested region %p-%p, "
		    "pmap=%p, start=%p, end=%p",
		    __func__, (void *)nested_region_start, (void *)nested_region_end,
		    pmap, (void*)start, (void*)end);
	}

	/* Contract the range to TT page boundaries. */
	adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
	adjusted_start = ((start + adjust_offmask) & ~adjust_offmask);
	adjusted_end = end & ~adjust_offmask;

	/* Iterate over the range, trying to remove TTEs. */
	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_twig_size(pt_attr)) {
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		tte_p = pmap_tte(pmap, cur);

		if ((tte_p != NULL) && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			pte_p = (pt_entry_t *) ttetokv(*tte_p);

			/* pmap_tte_deallocate()/pmap_tte_remove() will drop the pmap lock */
			if ((pmap->type == PMAP_TYPE_NESTED) && (ptep_get_info(pte_p)->refcnt == 0)) {
				/* Deallocate for the nested map. */
				pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else if (pmap->type == PMAP_TYPE_USER) {
				/**
				 * Just remove for the parent map. If the leaf table pointed
				 * to by the TTE being removed (owned by the nested pmap)
				 * has any mappings, then this call will panic. This
				 * enforces the policy that tables being trimmed must be
				 * empty to prevent possible use-after-free attacks.
				 */
				pmap_tte_remove(pmap, cur, cur + PAGE_SIZE, false, tte_p, pt_attr_twig_level(pt_attr));
			} else {
				panic("%s: Unsupported pmap type for nesting %p %d", __func__, pmap, pmap->type);
			}
		} else {
			/* Nothing to deallocate here; just release the lock. */
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}

	/* Remove empty L2 TTs. */
	adjusted_start = ((start + pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL)) & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL));
	adjusted_end = end & ~pt_attr_ln_offmask(pt_attr, PMAP_TT_L1_LEVEL);

	for (cur = adjusted_start; (cur < adjusted_end) && (cur >= adjusted_start); cur += pt_attr_ln_size(pt_attr, PMAP_TT_L1_LEVEL)) {
		/* For each L1 entry in our range... */
		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

		bool remove_tt1e = true;
		tt_entry_t * tt1e_p = pmap_tt1e(pmap, cur);
		tt_entry_t * tt2e_start;
		tt_entry_t * tt2e_end;
		tt_entry_t * tt2e_p;
		tt_entry_t tt1e;

		if (tt1e_p == NULL) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		tt1e = *tt1e_p;

		if (tt1e == ARM_TTE_TYPE_FAULT) {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
			continue;
		}

		/* Scan the entire L2 table; the L1 entry may only be removed
		 * if every L2 entry is a fault (empty) entry. */
		tt2e_start = &((tt_entry_t*) phystokv(tt1e & ARM_TTE_TABLE_MASK))[0];
		tt2e_end = &tt2e_start[pt_attr_page_size(pt_attr) / sizeof(*tt2e_start)];

		for (tt2e_p = tt2e_start; tt2e_p < tt2e_end; tt2e_p++) {
			if (*tt2e_p != ARM_TTE_TYPE_FAULT) {
				/*
				 * If any TTEs are populated, don't remove the
				 * L1 TT.
				 */
				remove_tt1e = false;
			}
		}

		if (remove_tt1e) {
			/* pmap_tte_deallocate() drops the pmap lock. */
			pmap_tte_deallocate(pmap, cur, cur + PAGE_SIZE, false, tt1e_p, PMAP_TT_L1_LEVEL);
		} else {
			pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);
		}
	}
}
9075
9076 /**
9077 * State machine for multi-step pmap trimming. Trimming is the action of
9078 * deallocating the TTEs of the shared region of pmaps down to a given range.
9079 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9080 * disabling preemption for too long. These steps include computing the bounds
9081 * of the shared region, trimming the head of the "grand", trimming the tail of
9082 * the "grand", and trimming the "subord". Some of the steps can be skipped under
9083 * different conditions.
9084 *
9085 * @param grand the pmap in which the pages are nested
9086 * @param subord the pmap from which the pages are shared, or nested
9087 * @param vstart start of the used range in "grand"
9088 * @param size size of the used range
9089 * @param state the current state of the state machine
9090 *
9091 * @return the next state of the state machine, to be used in the next call
9092 * into this function.
9093 */
9094 MARK_AS_PMAP_TEXT pmap_trim_state_t
9095 pmap_trim_internal(
9096 pmap_t grand,
9097 pmap_t subord,
9098 addr64_t vstart,
9099 uint64_t size,
9100 pmap_trim_state_t state)
9101 {
9102 /* Validation needs to be done regardless of state. */
9103 addr64_t vend;
9104
9105 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9106 panic("%s: grand addr wraps around, "
9107 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9108 __func__, grand, subord, (void*)vstart, size, state);
9109 }
9110
9111 validate_pmap_mutable(grand);
9112 validate_pmap(subord);
9113
9114 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9115 panic("%s: subord is of non-nestable type 0x%hhx, "
9116 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9117 __func__, subord->type, grand, subord, (void*)vstart, size, state);
9118 }
9119
9120 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9121 panic("%s: grand is of unsupprted type 0x%hhx for nesting, "
9122 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9123 __func__, grand->type, grand, subord, (void*)vstart, size, state);
9124 }
9125
9126 if (__improbable(grand->nested_pmap != subord)) {
9127 panic("%s: grand->nested != subord, "
9128 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9129 __func__, grand, subord, (void*)vstart, size, state);
9130 }
9131
9132 if (__improbable((size != 0) &&
9133 ((vstart < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))))) {
9134 panic("%s: grand range not in nested region, "
9135 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9136 __func__, grand, subord, (void*)vstart, size, state);
9137 }
9138
9139 /* Trimming starts with figuring out the bounds for the grand. */
9140 if (state == PMAP_TRIM_STATE_START) {
9141 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9142
9143 /**
9144 * The "nested_no_bounds_ref_state" enum is set to NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER by
9145 * `pmap_nest()` if the subord is nested into the grand when the bounds are not known yet.
9146 * Therefore, if it is NESTED_NO_BOUNDS_REF_NONE, either any nesting has not happened, or
9147 * trimming has been done, or nesting has been done with bounds known so the "extra" region
9148 * was not nested in the first place. Anyway, trimming is not needed so we exit early with
9149 * PMAP_TRIM_STATE_DONE.
9150 */
9151 if (grand->nested_no_bounds_ref_state == NESTED_NO_BOUNDS_REF_NONE) {
9152 assert(subord->nested_bounds_set);
9153
9154 /* Nothing to do if the grand already has bounds set, otherwise inherit from the subord. */
9155 if (!grand->nested_bounds_set) {
9156 /* Inherit the bounds from subord. */
9157 grand->nested_region_true_start = subord->nested_region_true_start;
9158 grand->nested_region_true_end = subord->nested_region_true_end;
9159 grand->nested_bounds_set = true;
9160 }
9161
9162 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9163
9164 /* Now that the grand has bounds, we are done. */
9165 return PMAP_TRIM_STATE_DONE;
9166 }
9167
9168 /* If the subord doesn't have bounds set yet, compute them from vstart and a non-zero size. */
9169 if ((!subord->nested_bounds_set) && size) {
9170 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9171 const addr64_t adjust_offmask = pt_attr_leaf_table_offmask(pt_attr);
9172
9173 subord->nested_region_true_start = vstart;
9174 subord->nested_region_true_end = vend;
9175 subord->nested_region_true_start &= ~adjust_offmask;
9176
9177 if (__improbable(os_add_overflow(subord->nested_region_true_end, adjust_offmask, &subord->nested_region_true_end))) {
9178 panic("%s: padded true end wraps around, "
9179 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9180 __func__, grand, subord, (void*)vstart, size, state);
9181 }
9182
9183 subord->nested_region_true_end &= ~adjust_offmask;
9184 subord->nested_bounds_set = true;
9185 }
9186
9187 /* If the subord has bounds set now, let the grand inherit and continue to trim. Otherwise, we are done. */
9188 if (subord->nested_bounds_set) {
9189 /* Inherit the bounds from subord. */
9190 grand->nested_region_true_start = subord->nested_region_true_start;
9191 grand->nested_region_true_end = subord->nested_region_true_end;
9192 grand->nested_bounds_set = true;
9193
9194 /* If we know the bounds, we can trim the pmap. */
9195 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9196
9197 state = PMAP_TRIM_STATE_GRAND_BEFORE;
9198 } else {
9199 /* Don't trim if we don't know the bounds. */
9200 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9201
9202 return PMAP_TRIM_STATE_DONE;
9203 }
9204 }
9205
9206 /* Sanity check here: we are ready to trim, do we know the bounds yet? */
9207 if (!grand->nested_bounds_set) {
9208 panic("%s: !grand->nested_bounds_set, "
9209 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9210 __func__, grand, subord, (void*)vstart, size, state);
9211 }
9212
9213 if (state == PMAP_TRIM_STATE_GRAND_BEFORE) {
9214 pmap_trim_range(grand, grand->nested_region_addr, grand->nested_region_true_start);
9215 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9216 NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, NESTED_NO_BOUNDS_REF_AFTER, release))) {
9217 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9218 (unsigned int)grand->nested_no_bounds_ref_state);
9219 }
9220
9221 #if XNU_MONITOR
9222 if (pmap_pending_preemption()) {
9223 return PMAP_TRIM_STATE_GRAND_AFTER;
9224 }
9225 #endif
9226
9227 state = PMAP_TRIM_STATE_GRAND_AFTER;
9228 }
9229
9230 if (state == PMAP_TRIM_STATE_GRAND_AFTER) {
9231 pmap_trim_range(grand, grand->nested_region_true_end, (grand->nested_region_addr + grand->nested_region_size));
9232 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9233 NESTED_NO_BOUNDS_REF_AFTER, NESTED_NO_BOUNDS_REF_SUBORD, release))) {
9234 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9235 (unsigned int)grand->nested_no_bounds_ref_state);
9236 }
9237
9238 #if XNU_MONITOR
9239 if (pmap_pending_preemption()) {
9240 return PMAP_TRIM_STATE_SUBORD;
9241 }
9242 #endif
9243
9244 state = PMAP_TRIM_STATE_SUBORD;
9245 }
9246
9247 /* START state is guaranteed to compute the bounds for the subord. */
9248 if (!subord->nested_bounds_set) {
9249 panic("%s: !subord->nested_bounds_set, "
9250 "grand=%p, subord=%p, vstart=%p, size=%#llx, state=%u",
9251 __func__, grand, subord, (void*)vstart, size, state);
9252 }
9253
9254 if (state == PMAP_TRIM_STATE_SUBORD) {
9255 /**
9256 * Since 'state' may be an attacker-controlled variable, we use nested_no_bounds_ref_state
9257 * to ensure that pmap_trim_subord (which may free page tables from subord) can only be
9258 * called once grand's nested tables have been fully trimmed, and can only be called once
9259 * for each 'grand' pmap. We use release ordering for the atomics above to ensure that
9260 * the state update is visible only once the preceding trim operation is complete. An
9261 * attacker may be able to trigger multiple concurrent trims on the same 'grand' region,
9262 * but locking within pmap_trim_range() should make that harmless (and all but one will
9263 * ultimately panic due to a failed atomic state CAS). We use acquire ordering here to
9264 * ensure that modifications performed by pmap_trim_subord() can't be reordered ahead
9265 * of the state CAS.
9266 */
9267 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9268 NESTED_NO_BOUNDS_REF_SUBORD, NESTED_NO_BOUNDS_REF_NONE, acquire))) {
9269 panic("%s: grand %p has unexpected no-bounds state %u", __func__, grand,
9270 (unsigned int)grand->nested_no_bounds_ref_state);
9271 }
9272 pmap_trim_subord(subord);
9273 }
9274
9275 return PMAP_TRIM_STATE_DONE;
9276 }
9277
/*
 * Drop this pmap's no-bounds reference on its nested pmap (if any),
 * trimming this pmap down to the nested region's true bounds and then
 * attempting to trim the nested pmap itself.
 */
MARK_AS_PMAP_TEXT static void
pmap_trim_self(pmap_t pmap)
{
	if ((pmap->nested_no_bounds_ref_state != NESTED_NO_BOUNDS_REF_NONE) && pmap->nested_pmap) {
		/* If we have a no bounds ref, we need to drop it. */
		pmap_lock(pmap->nested_pmap, PMAP_LOCK_SHARED);
		pmap->nested_no_bounds_ref_state = NESTED_NO_BOUNDS_REF_NONE;
		/* Snapshot the nested pmap's bounds while holding its lock. */
		boolean_t nested_bounds_set = pmap->nested_pmap->nested_bounds_set;
		vm_map_offset_t nested_region_true_start = pmap->nested_pmap->nested_region_true_start;
		vm_map_offset_t nested_region_true_end = pmap->nested_pmap->nested_region_true_end;
		pmap_unlock(pmap->nested_pmap, PMAP_LOCK_SHARED);

		if (nested_bounds_set) {
			/* Trim the slack before and after the true bounds. */
			pmap_trim_range(pmap, pmap->nested_region_addr, nested_region_true_start);
			pmap_trim_range(pmap, nested_region_true_end, (pmap->nested_region_addr + pmap->nested_region_size));
		}
		/*
		 * Try trimming the nested pmap, in case we had the
		 * last reference.
		 */
		pmap_trim_subord(pmap->nested_pmap);
	}
}
9301
9302 /*
9303 * pmap_trim_subord(grand, subord)
9304 *
9305 * grand = pmap that we have nested subord in
9306 * subord = nested pmap we are attempting to trim
9307 *
9308 * Trims subord if possible
9309 */
9310 MARK_AS_PMAP_TEXT static void
9311 pmap_trim_subord(pmap_t subord)
9312 {
9313 bool contract_subord = false;
9314
9315 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9316
9317 subord->nested_no_bounds_refcnt--;
9318
9319 if ((subord->nested_no_bounds_refcnt == 0) && (subord->nested_bounds_set)) {
9320 /* If this was the last no bounds reference, trim subord. */
9321 contract_subord = true;
9322 }
9323
9324 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9325
9326 if (contract_subord) {
9327 pmap_trim_range(subord, subord->nested_region_addr, subord->nested_region_true_start);
9328 pmap_trim_range(subord, subord->nested_region_true_end, subord->nested_region_addr + subord->nested_region_size);
9329 }
9330 }
9331
9332 /**
9333 * Deallocates the TTEs of the shared region of pmaps down to a given range.
9334 * On PPL-enabled systems, this needs to be done in multiple steps to avoid
9335 * disabling preemption for too long.
9336 *
9337 * @note When we load the shared region we always create pages tables for the
9338 * entire region. In practice, the shared cache may use just a portion
9339 * of that. Before we know the bounds of the shared region, it can
9340 * already be mapped into processes. Therefore, once the bounds are
9341 * known, "trimming" comes in handy to remove the unnecessary page
9342 * tables in the processes the shared region is mapped in, and eventually
9343 * those in the shared region itself. Note that the shared region must
9344 * be trimmed after the user processes because it has the L3 entries
9345 * everyone else is pointing to.
9346 *
9347 * @param grand the pmap in which the pages are nested
9348 * @param subord the pmap from which the pages are shared, or nested
9349 * @param vstart start of the used range in "grand"
9350 * @param size size of the used range
9351 */
9352 void
9353 pmap_trim(
9354 pmap_t grand,
9355 pmap_t subord,
9356 addr64_t vstart,
9357 uint64_t size)
9358 {
9359 pmap_trim_state_t state = PMAP_TRIM_STATE_START;
9360
9361 #if XNU_MONITOR
9362 /* On PPL systems, drives the state machine until its done. */
9363 while (state != PMAP_TRIM_STATE_DONE) {
9364 __assert_only pmap_trim_state_t old_state = state;
9365 state = pmap_trim_ppl(grand, subord, vstart, size, state);
9366
9367 /* Are we making progress? */
9368 assert(old_state != state);
9369 }
9370
9371 pmap_ledger_check_balance(grand);
9372 pmap_ledger_check_balance(subord);
9373 #else
9374 state = pmap_trim_internal(grand, subord, vstart, size, state);
9375
9376 /* On non-PPL systems, we expect the implementation to finish in one call. */
9377 assert(state == PMAP_TRIM_STATE_DONE);
9378 #endif
9379 }
9380
9381 #if HAS_APPLE_PAC
/*
 * Sign 'value' as a user pointer with the given process-independent
 * ptrauth key (asia or asda only) and 'discriminator', with the user
 * JOP key 'jop_key' installed for the duration of the signing.
 * Interrupts are disabled around the JOP key swap.
 *
 * @return the signed pointer.
 */
void *
pmap_sign_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only process-independent keys are permitted for user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to sign user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);

	/* Keep the sign operation from being reordered outside the window in
	 * which the user JOP key is enabled. */
	__compiler_materialize_and_prevent_reordering_on(value);
	switch (key) {
	case ptrauth_key_asia:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asia, discriminator);
		break;
	case ptrauth_key_asda:
		res = ptrauth_sign_unauthenticated(value, ptrauth_key_asda, discriminator);
		break;
	default:
		/* Unreachable: key was validated above. */
		__builtin_unreachable();
	}
	__compiler_materialize_and_prevent_reordering_on(res);

	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9413
9414 void *
9415 pmap_sign_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9416 {
9417 return pmap_sign_user_ptr_internal(value, key, discriminator, jop_key);
9418 }
9419
/*
 * Authenticate a signed user pointer 'value' with the given
 * process-independent ptrauth key (asia or asda only) and
 * 'discriminator', with the user JOP key 'jop_key' installed for the
 * duration of the check.  Interrupts are disabled around the JOP key
 * swap.
 *
 * @return the authenticated (stripped) pointer.
 */
void *
pmap_auth_user_ptr_internal(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
{
	/* Only process-independent keys are permitted for user pointers. */
	if ((key != ptrauth_key_asia) && (key != ptrauth_key_asda)) {
		panic("attempt to auth user pointer without process independent key");
	}

	void *res = NULL;
	uint64_t current_intr_state = pmap_interrupts_disable();

	uint64_t saved_jop_state = ml_enable_user_jop_key(jop_key);
	/* Keep the auth operation from being reordered outside the window in
	 * which the user JOP key is enabled. */
	__compiler_materialize_and_prevent_reordering_on(value);
	res = ml_auth_ptr_unchecked(value, key, discriminator);
	__compiler_materialize_and_prevent_reordering_on(res);
	ml_disable_user_jop_key(jop_key, saved_jop_state);

	pmap_interrupts_restore(current_intr_state);

	return res;
}
9440
9441 void *
9442 pmap_auth_user_ptr(void *value, ptrauth_key key, uint64_t discriminator, uint64_t jop_key)
9443 {
9444 return pmap_auth_user_ptr_internal(value, key, discriminator, jop_key);
9445 }
9446 #endif /* HAS_APPLE_PAC */
9447
9448 /*
9449 * Marker to indicate that a pmap_[un]nest() operation has finished operating on
9450 * the 'subordinate' pmap and has begun operating on the 'grand' pmap. This
9451 * flag is supplied in the low-order bit of the 'vrestart' param as well as the
9452 * return value, to indicate where a preempted [un]nest operation should resume.
9453 * When the return value contains the ending address of the nested region with
9454 * PMAP_NEST_GRAND in the low-order bit, the operation has completed.
9455 */
9456 #define PMAP_NEST_GRAND ((vm_map_offset_t) 0x1)
9457
9458 /*
9459 * kern_return_t pmap_nest(grand, subord, vstart, size)
9460 *
9461 * grand = the pmap that we will nest subord into
9462 * subord = the pmap that goes into the grand
9463 * vstart = start of range in pmap to be inserted
9464 * size = Size of nest area (up to 16TB)
9465 *
9466 * Inserts a pmap into another. This is used to implement shared segments.
9467 *
9468 */
9469
9470 /**
9471 * Embeds a range of mappings from one pmap ('subord') into another ('grand')
9472 * by inserting the twig-level TTEs from 'subord' directly into 'grand'.
9473 * This function operates in 3 main phases:
9474 * 1. Bookkeeping to ensure tracking structures for the nested region are set up.
9475 * 2. Expansion of subord to ensure the required leaf-level page table pages for
9476 * the mapping range are present in subord.
9477 * 3. Copying of twig-level TTEs from subord to grand, such that grand ultimately
9478 * contains pointers to subord's leaf-level pagetable pages for the specified
9479 * VA range.
9480 *
9481 * This function may return early due to pending AST_URGENT preemption; if so
9482 * it will indicate the need to be re-entered.
9483 *
9484 * @param grand pmap to insert the TTEs into. Must be a user pmap.
9485 * @param subord pmap from which to extract the TTEs. Must be a nested pmap.
9486 * @param vstart twig-aligned virtual address for the beginning of the nesting range
9487 * @param size twig-aligned size of the nesting range
9488 * @param vrestart the twig-aligned starting address of the current call. May contain
9489 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 3) above.
9490 * @param krp Should be initialized to KERN_SUCCESS by caller, will be set to
9491 * KERN_RESOURCE_SHORTAGE on allocation failure.
9492 *
9493 * @return the virtual address at which to restart the operation, possibly including
9494 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
9495 * (vstart + size) | PMAP_NEST_GRAND is returned, the operation completed.
9496 */
9497 MARK_AS_PMAP_TEXT vm_map_offset_t
9498 pmap_nest_internal(
9499 pmap_t grand,
9500 pmap_t subord,
9501 addr64_t vstart,
9502 uint64_t size,
9503 vm_map_offset_t vrestart,
9504 kern_return_t *krp)
9505 {
9506 kern_return_t kr = KERN_FAILURE;
9507 vm_map_offset_t vaddr;
9508 tt_entry_t *stte_p;
9509 tt_entry_t *gtte_p;
9510 uint64_t nested_region_unnested_table_bitmap_size;
9511 unsigned int* nested_region_unnested_table_bitmap = NULL;
9512 uint64_t new_nested_region_unnested_table_bitmap_size;
9513 unsigned int* new_nested_region_unnested_table_bitmap = NULL;
9514 int expand_options = 0;
9515 bool deref_subord = true;
9516 bool grand_locked = false;
9517
9518 addr64_t vend;
9519 if (__improbable(os_add_overflow(vstart, size, &vend))) {
9520 panic("%s: %p grand addr wraps around: 0x%llx + 0x%llx", __func__, grand, vstart, size);
9521 }
9522 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
9523 ((vrestart & ~PMAP_NEST_GRAND) < vstart))) {
9524 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
9525 (unsigned long long)vrestart, (unsigned long long)vstart, (unsigned long long)vend);
9526 }
9527
9528 assert(krp != NULL);
9529 validate_pmap_mutable(grand);
9530 validate_pmap(subord);
9531 #if XNU_MONITOR
9532 /*
9533 * Ordering is important here. validate_pmap() has already ensured subord is a
9534 * PPL-controlled pmap pointer, but it could have already been destroyed or could
9535 * be in the process of being destroyed. If destruction is already committed,
9536 * then the check of ref_count below will cover us. If destruction is initiated
9537 * during or after this call, then pmap_destroy() will catch the non-zero
9538 * nested_count.
9539 */
9540 os_atomic_inc(&subord->nested_count, relaxed);
9541 os_atomic_thread_fence(seq_cst);
9542 #endif
9543 if (__improbable(os_atomic_inc_orig(&subord->ref_count, relaxed) <= 0)) {
9544 panic("%s: invalid subordinate pmap %p", __func__, subord);
9545 }
9546
9547 const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
9548 if (__improbable(pmap_get_pt_attr(subord) != pt_attr)) {
9549 panic("%s: attempt to nest pmap %p into pmap %p with mismatched attributes", __func__, subord, grand);
9550 }
9551
9552 #if XNU_MONITOR
9553 expand_options |= PMAP_TT_ALLOCATE_NOWAIT;
9554 #endif
9555
9556 if (__improbable(((size | vstart | (vrestart & ~PMAP_NEST_GRAND)) &
9557 (pt_attr_leaf_table_offmask(pt_attr))) != 0x0ULL)) {
9558 panic("pmap_nest() pmap %p unaligned nesting request 0x%llx, 0x%llx, 0x%llx",
9559 grand, vstart, size, (unsigned long long)vrestart);
9560 }
9561
9562 if (__improbable(subord->type != PMAP_TYPE_NESTED)) {
9563 panic("%s: subordinate pmap %p is of non-nestable type 0x%hhx", __func__, subord, subord->type);
9564 }
9565
9566 if (__improbable(grand->type != PMAP_TYPE_USER)) {
9567 panic("%s: grand pmap %p is of unsupported type 0x%hhx for nesting", __func__, grand, grand->type);
9568 }
9569
9570 if (subord->nested_region_unnested_table_bitmap == NULL) {
9571 nested_region_unnested_table_bitmap_size = (size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9572
9573 if (__improbable((nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9574 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9575 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9576 __func__, nested_region_unnested_table_bitmap_size,
9577 grand, subord, vstart, size);
9578 }
9579
9580 #if XNU_MONITOR
9581 pmap_paddr_t pa = 0;
9582
9583 if (__improbable((nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9584 panic("%s: nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9585 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9586 __FUNCTION__, nested_region_unnested_table_bitmap_size,
9587 grand, subord, vstart, size);
9588 }
9589
9590 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9591
9592 if (kr != KERN_SUCCESS) {
9593 goto nest_cleanup;
9594 }
9595
9596 assert(pa);
9597
9598 nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9599 #else
9600 nested_region_unnested_table_bitmap = kalloc_data(
9601 nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9602 Z_WAITOK | Z_ZERO);
9603 #endif
9604
9605 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9606 kr = KERN_ABORTED;
9607 goto nest_cleanup;
9608 }
9609
9610 if (subord->nested_region_unnested_table_bitmap == NULL) {
9611 subord->nested_region_unnested_table_bitmap_size = (unsigned int) nested_region_unnested_table_bitmap_size;
9612 subord->nested_region_addr = vstart;
9613 subord->nested_region_size = (mach_vm_offset_t) size;
9614
9615 /**
9616 * Ensure that the rest of the subord->nested_region_* fields are
9617 * initialized and visible before setting the nested_region_unnested_table_bitmap
9618 * field (which is used as the flag to say that the rest are initialized).
9619 */
9620 __builtin_arm_dmb(DMB_ISHST);
9621 subord->nested_region_unnested_table_bitmap = nested_region_unnested_table_bitmap;
9622 nested_region_unnested_table_bitmap = NULL;
9623 }
9624 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9625 if (nested_region_unnested_table_bitmap != NULL) {
9626 #if XNU_MONITOR
9627 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9628 #else
9629 kfree_data(nested_region_unnested_table_bitmap,
9630 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9631 #endif
9632 nested_region_unnested_table_bitmap = NULL;
9633 }
9634 }
9635
9636 /**
9637 * Ensure subsequent reads of the subord->nested_region_* fields don't get
9638 * speculated before their initialization.
9639 */
9640 __builtin_arm_dmb(DMB_ISHLD);
9641
9642 if ((subord->nested_region_addr + subord->nested_region_size) < vend) {
9643 uint64_t new_size;
9644
9645 nested_region_unnested_table_bitmap = NULL;
9646 nested_region_unnested_table_bitmap_size = 0ULL;
9647 new_size = vend - subord->nested_region_addr;
9648
9649 new_nested_region_unnested_table_bitmap_size = (new_size >> pt_attr_twig_shift(pt_attr)) / (sizeof(unsigned int) * NBBY) + 1;
9650
9651 if (__improbable((new_nested_region_unnested_table_bitmap_size > UINT_MAX))) {
9652 panic("%s: subord->nested_region_unnested_table_bitmap_size=%llu will truncate, "
9653 "grand=%p, subord=%p, vstart=0x%llx, size=%llx",
9654 __func__, new_nested_region_unnested_table_bitmap_size,
9655 grand, subord, vstart, size);
9656 }
9657
9658 #if XNU_MONITOR
9659 pmap_paddr_t pa = 0;
9660
9661 if (__improbable((new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int)) > PAGE_SIZE)) {
9662 panic("%s: new_nested_region_unnested_table_bitmap_size=%llu will not fit in a page, "
9663 "grand=%p, subord=%p, vstart=0x%llx, new_size=%llx",
9664 __FUNCTION__, new_nested_region_unnested_table_bitmap_size,
9665 grand, subord, vstart, new_size);
9666 }
9667
9668 kr = pmap_pages_alloc_zeroed(&pa, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
9669
9670 if (kr != KERN_SUCCESS) {
9671 goto nest_cleanup;
9672 }
9673
9674 assert(pa);
9675
9676 new_nested_region_unnested_table_bitmap = (unsigned int *)phystokv(pa);
9677 #else
9678 new_nested_region_unnested_table_bitmap = kalloc_data(
9679 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int),
9680 Z_WAITOK | Z_ZERO);
9681 #endif
9682 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9683 kr = KERN_ABORTED;
9684 goto nest_cleanup;
9685 }
9686
9687 if (subord->nested_region_size < new_size) {
9688 bcopy(subord->nested_region_unnested_table_bitmap,
9689 new_nested_region_unnested_table_bitmap, subord->nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9690 nested_region_unnested_table_bitmap_size = subord->nested_region_unnested_table_bitmap_size;
9691 nested_region_unnested_table_bitmap = subord->nested_region_unnested_table_bitmap;
9692 subord->nested_region_unnested_table_bitmap = new_nested_region_unnested_table_bitmap;
9693 subord->nested_region_unnested_table_bitmap_size = (unsigned int) new_nested_region_unnested_table_bitmap_size;
9694 subord->nested_region_size = new_size;
9695 new_nested_region_unnested_table_bitmap = NULL;
9696 }
9697 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9698 if (nested_region_unnested_table_bitmap != NULL) {
9699 #if XNU_MONITOR
9700 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9701 #else
9702 kfree_data(nested_region_unnested_table_bitmap,
9703 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9704 #endif
9705 nested_region_unnested_table_bitmap = NULL;
9706 }
9707 if (new_nested_region_unnested_table_bitmap != NULL) {
9708 #if XNU_MONITOR
9709 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9710 #else
9711 kfree_data(new_nested_region_unnested_table_bitmap,
9712 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9713 #endif
9714 new_nested_region_unnested_table_bitmap = NULL;
9715 }
9716 }
9717
9718 if (!pmap_lock_preempt(subord, PMAP_LOCK_EXCLUSIVE)) {
9719 kr = KERN_ABORTED;
9720 goto nest_cleanup;
9721 }
9722
9723 if (os_atomic_cmpxchg(&grand->nested_pmap, PMAP_NULL, subord, seq_cst)) {
9724 /**
9725 * Ensure that a concurrent call to pmap_set_nested() hasn't turned grand
9726 * into a nested pmap, which would then produce multiple levels of nesting.
9727 */
9728 if (__improbable(os_atomic_load(&grand->type, seq_cst) != PMAP_TYPE_USER)) {
9729 panic("%s: attempt to nest into non-USER pmap %p", __func__, grand);
9730 }
9731 /*
9732 * If this is grand's first nesting operation, keep the reference on subord.
9733 * It will be released by pmap_destroy_internal() when grand is destroyed.
9734 */
9735 deref_subord = false;
9736
9737 if (!subord->nested_bounds_set) {
9738 /*
9739 * We are nesting without the shared regions bounds
9740 * being known. We'll have to trim the pmap later.
9741 */
9742 if (__improbable(!os_atomic_cmpxchg(&grand->nested_no_bounds_ref_state,
9743 NESTED_NO_BOUNDS_REF_NONE, NESTED_NO_BOUNDS_REF_BEFORE_AND_AFTER, relaxed))) {
9744 panic("%s: grand %p already nested", __func__, grand);
9745 }
9746 subord->nested_no_bounds_refcnt++;
9747 }
9748
9749 if (__improbable(vstart < subord->nested_region_addr ||
9750 vend > (subord->nested_region_addr + subord->nested_region_size))) {
9751 panic("%s: grand nested region (%p: [%p, %p)) will fall outside of subord nested region (%p: [%p, %p))",
9752 __func__, grand, (void *) vstart, (void *) vend, subord, (void *) subord->nested_region_addr,
9753 (void *) (subord->nested_region_addr + subord->nested_region_size));
9754 }
9755
9756 grand->nested_region_addr = vstart;
9757 grand->nested_region_size = (mach_vm_offset_t) size;
9758 } else {
9759 if (__improbable(grand->nested_pmap != subord)) {
9760 panic("pmap_nest() pmap %p has a nested pmap", grand);
9761 } else if (__improbable(grand->nested_region_addr > vstart)) {
9762 panic("pmap_nest() pmap %p : attempt to nest outside the nested region", grand);
9763 } else if ((grand->nested_region_addr + grand->nested_region_size) < vend) {
9764 grand->nested_region_size = (mach_vm_offset_t)(vstart - grand->nested_region_addr + size);
9765 }
9766 }
9767
9768 vaddr = vrestart & ~PMAP_NEST_GRAND;
9769 if (vaddr < subord->nested_region_true_start) {
9770 vaddr = subord->nested_region_true_start;
9771 }
9772
9773 addr64_t true_end = vend;
9774 if (true_end > subord->nested_region_true_end) {
9775 true_end = subord->nested_region_true_end;
9776 }
9777 __unused unsigned int ttecount = 0;
9778
9779 if (vrestart & PMAP_NEST_GRAND) {
9780 goto nest_grand;
9781 }
9782
9783 while (vaddr < true_end) {
9784 stte_p = pmap_tte(subord, vaddr);
9785 if (stte_p == PT_ENTRY_NULL || *stte_p == ARM_TTE_EMPTY) {
9786 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9787 kr = pmap_expand(subord, vaddr, expand_options, pt_attr_leaf_level(pt_attr));
9788
9789 if (kr != KERN_SUCCESS) {
9790 goto done;
9791 }
9792
9793 pmap_lock(subord, PMAP_LOCK_EXCLUSIVE);
9794 }
9795 vaddr += pt_attr_twig_size(pt_attr);
9796 vrestart = vaddr;
9797 ++ttecount;
9798 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9799 pmap_pending_preemption())) {
9800 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9801 kr = KERN_SUCCESS;
9802 goto done;
9803 }
9804 }
9805 /*
9806 * copy TTEs from subord pmap into grand pmap
9807 */
9808
9809 vaddr = (vm_map_offset_t) vstart;
9810 if (vaddr < subord->nested_region_true_start) {
9811 vaddr = subord->nested_region_true_start;
9812 }
9813 vrestart = vaddr | PMAP_NEST_GRAND;
9814
9815 nest_grand:
9816 pmap_unlock(subord, PMAP_LOCK_EXCLUSIVE);
9817
9818 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9819 kr = KERN_ABORTED;
9820 goto done;
9821 }
9822 while (vaddr < true_end) {
9823 gtte_p = pmap_tte(grand, vaddr);
9824 if (gtte_p == PT_ENTRY_NULL) {
9825 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9826 kr = pmap_expand(grand, vaddr, expand_options, pt_attr_twig_level(pt_attr));
9827 if (!(grand_locked = pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE))) {
9828 if (kr == KERN_SUCCESS) {
9829 kr = KERN_ABORTED;
9830 }
9831 }
9832
9833 if (kr != KERN_SUCCESS) {
9834 goto done;
9835 }
9836
9837 gtte_p = pmap_tt2e(grand, vaddr);
9838 }
9839 /* Don't leak a page table page. Don't violate break-before-make. */
9840 if (__improbable(*gtte_p != ARM_TTE_EMPTY)) {
9841 panic("%s: attempting to overwrite non-empty TTE %p in pmap %p",
9842 __func__, gtte_p, grand);
9843 }
9844 /**
9845 * It's possible that grand was trimmed by pmap_trim_internal() while the
9846 * lock was dropped, in which case the previously stored "true" start/end
9847 * will no longer be accurate. In that case, we need to avoid nesting
9848 * tables outside the trimmed range, as those tables may be immediately freed
9849 * which would lead to a dangling page table pointer in grand.
9850 * Note that pmap_trim() may concurrently update grand's bounds as we are
9851 * making these checks, but in that case pmap_trim_range() has not yet
9852 * been called on grand and will wait for us to drop grand's lock, so it
9853 * should see any TTEs we've nested here and clear them appropriately.
9854 */
9855 if (__probable((vaddr >= grand->nested_region_true_start) &&
9856 (vaddr < grand->nested_region_true_end))) {
9857 stte_p = pmap_tte(subord, vaddr);
9858 if (__improbable(stte_p == PT_ENTRY_NULL)) {
9859 panic("%s: subord pmap %p not expanded at va 0x%llx", __func__, subord, (unsigned long long)vaddr);
9860 }
9861 *gtte_p = *stte_p;
9862 }
9863
9864 vaddr += pt_attr_twig_size(pt_attr);
9865 vrestart = vaddr | PMAP_NEST_GRAND;
9866 ++ttecount;
9867 if (__improbable(!(ttecount % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
9868 pmap_pending_preemption())) {
9869 break;
9870 }
9871 }
9872 if (vaddr >= true_end) {
9873 vrestart = vend | PMAP_NEST_GRAND;
9874 }
9875
9876 kr = KERN_SUCCESS;
9877 done:
9878
9879 FLUSH_PTE();
9880 __builtin_arm_isb(ISB_SY);
9881
9882 if (grand_locked) {
9883 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
9884 }
9885
9886 nest_cleanup:
9887 #if XNU_MONITOR
9888 if (kr != KERN_SUCCESS) {
9889 pmap_pin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9890 *krp = kr;
9891 pmap_unpin_kernel_pages((vm_offset_t)krp, sizeof(*krp));
9892 }
9893 #else
9894 if (kr != KERN_SUCCESS) {
9895 *krp = kr;
9896 }
9897 #endif
9898 if (nested_region_unnested_table_bitmap != NULL) {
9899 #if XNU_MONITOR
9900 pmap_pages_free(kvtophys_nofail((vm_offset_t)nested_region_unnested_table_bitmap), PAGE_SIZE);
9901 #else
9902 kfree_data(nested_region_unnested_table_bitmap,
9903 nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9904 #endif
9905 }
9906 if (new_nested_region_unnested_table_bitmap != NULL) {
9907 #if XNU_MONITOR
9908 pmap_pages_free(kvtophys_nofail((vm_offset_t)new_nested_region_unnested_table_bitmap), PAGE_SIZE);
9909 #else
9910 kfree_data(new_nested_region_unnested_table_bitmap,
9911 new_nested_region_unnested_table_bitmap_size * sizeof(unsigned int));
9912 #endif
9913 }
9914 if (deref_subord) {
9915 #if XNU_MONITOR
9916 os_atomic_dec(&subord->nested_count, relaxed);
9917 #endif
9918 pmap_destroy_internal(subord);
9919 }
9920 return vrestart;
9921 }
9922
/**
 * Nest the shared-region pmap 'subord' into the user pmap 'grand' over the
 * range [vstart, vstart + size).  The work is performed in preemptible
 * chunks by pmap_nest_internal()/pmap_nest_ppl(), each call returning a
 * resume cursor; this wrapper loops until the cursor reports completion
 * (vend | PMAP_NEST_GRAND).
 *
 * @param grand  pmap that will contain the nested mappings.
 * @param subord pmap providing the mappings to be nested.
 * @param vstart twig-aligned base virtual address of the range.
 * @param size   twig-aligned size of the range, in bytes.
 *
 * @return KERN_SUCCESS on completion, or the first hard failure reported by
 *         the chunked worker.
 */
kern_return_t
pmap_nest(
	pmap_t grand,
	pmap_t subord,
	addr64_t vstart,
	uint64_t size)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_map_offset_t vaddr = (vm_map_offset_t)vstart;
	vm_map_offset_t vend = vaddr + size;
	__unused vm_map_offset_t vlast = vaddr; /* only used to detect lack of progress (PPL path) */

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(subord),
	    VM_KERNEL_ADDRHIDE(vstart));

	pmap_verify_preemptible();
#if XNU_MONITOR
	while (vaddr != (vend | PMAP_NEST_GRAND)) {
		vaddr = pmap_nest_ppl(grand, subord, vstart, size, vaddr, &kr);
		if (kr == KERN_RESOURCE_SHORTAGE) {
			/* The PPL ran out of free pages: donate one and retry the chunk. */
			pmap_alloc_page_for_ppl(0);
			kr = KERN_SUCCESS;
		} else if (kr == KERN_ABORTED) {
			/**
			 * pmap_nest_internal() assumes passed-in kr is KERN_SUCCESS in
			 * that it won't update kr when KERN_SUCCESS is to be returned.
			 * Therefore, the KERN_ABORTED needs to be manually cleared here,
			 * like how it is done in the KERN_RESOURCE_SHORTAGE case.
			 */
			kr = KERN_SUCCESS;
			continue;
		} else if (kr != KERN_SUCCESS) {
			break;
		} else if (vaddr == vlast) {
			/* A successful chunk must advance the cursor; anything else would loop forever. */
			panic("%s: failed to make forward progress from 0x%llx to 0x%llx at 0x%llx",
			    __func__, (unsigned long long)vstart, (unsigned long long)vend, (unsigned long long)vaddr);
		}
		vlast = vaddr;
	}

	pmap_ledger_check_balance(grand);
	pmap_ledger_check_balance(subord);
#else
	/**
	 * We don't need to check KERN_RESOURCE_SHORTAGE or KERN_ABORTED because
	 * we have verified preemptibility. Therefore, pmap_nest_internal() will
	 * wait for a page or a lock instead of bailing out as in the PPL flavor.
	 */
	while ((vaddr != (vend | PMAP_NEST_GRAND)) && (kr == KERN_SUCCESS)) {
		vaddr = pmap_nest_internal(grand, subord, vstart, size, vaddr, &kr);
	}
#endif

	PMAP_TRACE(2, PMAP_CODE(PMAP__NEST) | DBG_FUNC_END, kr);

	return kr;
}
9981
9982 /*
9983 * kern_return_t pmap_unnest(grand, vaddr)
9984 *
9985 * grand = the pmap that will have the virtual range unnested
9986 * vaddr = start of range in pmap to be unnested
9987 * size = size of range in pmap to be unnested
9988 *
9989 */
9990
9991 kern_return_t
9992 pmap_unnest(
9993 pmap_t grand,
9994 addr64_t vaddr,
9995 uint64_t size)
9996 {
9997 return pmap_unnest_options(grand, vaddr, size, 0);
9998 }
9999
10000 /**
10001 * Undoes a prior pmap_nest() operation by removing a range of nesting mappings
10002 * from a top-level pmap ('grand'). The corresponding mappings in the nested
10003 * pmap will be marked non-global to avoid TLB conflicts with pmaps that may
10004 * still have the region nested. The mappings in 'grand' will be left empty
10005 * with the assumption that they will be demand-filled by subsequent access faults.
10006 *
10007 * This function operates in 2 main phases:
10008 * 1. Iteration over the nested pmap's mappings for the specified range to mark
10009 * them non-global.
10010 * 2. Clearing of the twig-level TTEs for the address range in grand.
10011 *
10012 * This function may return early due to pending AST_URGENT preemption; if so
10013 * it will indicate the need to be re-entered.
10014 *
10015 * @param grand pmap from which to unnest mappings
10016 * @param vaddr twig-aligned virtual address for the beginning of the nested range
10017 * @param size twig-aligned size of the nested range
10018 * @param vrestart the page-aligned starting address of the current call. May contain
10019 * PMAP_NEST_GRAND in bit 0 to indicate the operation should skip to step 2) above.
10020 * @param option Extra control flags; may contain PMAP_UNNEST_CLEAN to indicate that
10021 * grand is being torn down and step 1) above is not needed.
10022 *
10023 * @return the virtual address at which to restart the operation, possibly including
10024 * PMAP_NEST_GRAND to indicate the phase at which to restart. If
10025 * (vaddr + size) | PMAP_NEST_GRAND is returned, the operation completed.
10026 */
10027 MARK_AS_PMAP_TEXT vm_map_offset_t
10028 pmap_unnest_options_internal(
10029 pmap_t grand,
10030 addr64_t vaddr,
10031 uint64_t size,
10032 vm_map_offset_t vrestart,
10033 unsigned int option)
10034 {
10035 vm_map_offset_t start;
10036 vm_map_offset_t addr;
10037 tt_entry_t *tte_p;
10038 unsigned int current_index;
10039 unsigned int start_index;
10040 unsigned int max_index;
10041 unsigned int entry_count = 0;
10042
10043 addr64_t vend;
10044 addr64_t true_end;
10045 if (__improbable(os_add_overflow(vaddr, size, &vend))) {
10046 panic("%s: %p vaddr wraps around: 0x%llx + 0x%llx", __func__, grand, vaddr, size);
10047 }
10048 if (__improbable(((vrestart & ~PMAP_NEST_GRAND) > vend) ||
10049 ((vrestart & ~PMAP_NEST_GRAND) < vaddr))) {
10050 panic("%s: vrestart 0x%llx is outside range [0x%llx, 0x%llx)", __func__,
10051 (unsigned long long)vrestart, (unsigned long long)vaddr, (unsigned long long)vend);
10052 }
10053
10054 validate_pmap_mutable(grand);
10055
10056 if ((vaddr < grand->nested_region_addr) || (vend > (grand->nested_region_addr + grand->nested_region_size))) {
10057 panic("%s: %p: unnest request to not-fully-nested region [%p, %p)", __func__, grand, (void*)vaddr, (void*)vend);
10058 }
10059
10060 __unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(grand);
10061
10062 if (__improbable(((size | vaddr) & pt_attr_twig_offmask(pt_attr)) != 0x0ULL)) {
10063 panic("%s: unaligned base address 0x%llx or size 0x%llx", __func__,
10064 (unsigned long long)vaddr, (unsigned long long)size);
10065 }
10066
10067 if (__improbable(grand->nested_pmap == NULL)) {
10068 panic("%s: %p has no nested pmap", __func__, grand);
10069 }
10070
10071 true_end = vend;
10072 if (true_end > grand->nested_pmap->nested_region_true_end) {
10073 true_end = grand->nested_pmap->nested_region_true_end;
10074 }
10075
10076 if (((option & PMAP_UNNEST_CLEAN) == 0) && !(vrestart & PMAP_NEST_GRAND)) {
10077 if (!pmap_lock_preempt(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE)) {
10078 return vrestart;
10079 }
10080
10081 start = vrestart;
10082 if (start < grand->nested_pmap->nested_region_true_start) {
10083 start = grand->nested_pmap->nested_region_true_start;
10084 }
10085 start_index = (unsigned int)((start - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10086 max_index = (unsigned int)((true_end - grand->nested_region_addr) >> pt_attr_twig_shift(pt_attr));
10087 bool flush_tlb = false;
10088
10089 for (current_index = start_index, addr = start; current_index < max_index; current_index++) {
10090 pt_entry_t *bpte, *cpte;
10091
10092 vm_map_offset_t vlim = (addr + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
10093
10094 bpte = pmap_pte(grand->nested_pmap, addr);
10095
10096 /*
10097 * If we've re-entered this function partway through unnesting a leaf region, the
10098 * 'unnest' bit will be set in the ASID bitmap, but we won't have finished updating
10099 * the run of PTEs. We therefore also need to check for a non-twig-aligned starting
10100 * address.
10101 */
10102 if (!testbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap) ||
10103 (addr & pt_attr_twig_offmask(pt_attr))) {
10104 /*
10105 * Mark the 'twig' region as being unnested. Every mapping entered within
10106 * the nested pmap in this region will now be marked non-global. Do this
10107 * before marking any of the PTEs within the region as non-global to avoid
10108 * the possibility of pmap_enter() subsequently inserting a global mapping
10109 * in the region, which could lead to a TLB conflict if a non-global entry
10110 * is later inserted for the same VA in a pmap which has fully unnested this
10111 * region.
10112 */
10113 setbit(current_index, (int *)grand->nested_pmap->nested_region_unnested_table_bitmap);
10114 for (cpte = bpte; (bpte != NULL) && (addr < vlim); cpte += PAGE_RATIO) {
10115 pmap_paddr_t pa;
10116 unsigned int pai = 0;
10117 boolean_t managed = FALSE;
10118 pt_entry_t spte;
10119
10120 if ((*cpte != ARM_PTE_TYPE_FAULT)
10121 && (!ARM_PTE_IS_COMPRESSED(*cpte, cpte))) {
10122 spte = *((volatile pt_entry_t*)cpte);
10123 while (!managed) {
10124 pa = pte_to_pa(spte);
10125 if (!pa_valid(pa)) {
10126 break;
10127 }
10128 pai = pa_index(pa);
10129 pvh_lock(pai);
10130 spte = *((volatile pt_entry_t*)cpte);
10131 pa = pte_to_pa(spte);
10132 if (pai == pa_index(pa)) {
10133 managed = TRUE;
10134 break; // Leave the PVH locked as we'll unlock it after we update the PTE
10135 }
10136 pvh_unlock(pai);
10137 }
10138
10139 if (((spte & ARM_PTE_NG) != ARM_PTE_NG)) {
10140 write_pte_fast(cpte, (spte | ARM_PTE_NG));
10141 flush_tlb = true;
10142 }
10143
10144 if (managed) {
10145 pvh_assert_locked(pai);
10146 pvh_unlock(pai);
10147 }
10148 }
10149
10150 addr += (pt_attr_page_size(pt_attr) * PAGE_RATIO);
10151 vrestart = addr;
10152 ++entry_count;
10153 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10154 pmap_pending_preemption())) {
10155 goto unnest_subord_done;
10156 }
10157 }
10158 }
10159 addr = vlim;
10160 vrestart = addr;
10161 ++entry_count;
10162 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10163 pmap_pending_preemption())) {
10164 break;
10165 }
10166 }
10167
10168 unnest_subord_done:
10169 if (flush_tlb) {
10170 FLUSH_PTE_STRONG();
10171 PMAP_UPDATE_TLBS(grand->nested_pmap, start, vrestart, false, true);
10172 }
10173
10174 pmap_unlock(grand->nested_pmap, PMAP_LOCK_EXCLUSIVE);
10175 if (current_index < max_index) {
10176 return vrestart;
10177 }
10178 }
10179
10180 /*
10181 * invalidate all pdes for segment at vaddr in pmap grand
10182 */
10183 if (vrestart & PMAP_NEST_GRAND) {
10184 addr = vrestart & ~PMAP_NEST_GRAND;
10185 if (__improbable(addr & pt_attr_twig_offmask(pt_attr)) != 0x0ULL) {
10186 panic("%s: unaligned vrestart 0x%llx", __func__, (unsigned long long)addr);
10187 }
10188 } else {
10189 addr = vaddr;
10190 vrestart = vaddr | PMAP_NEST_GRAND;
10191 }
10192
10193 /**
10194 * If we exit here due to a busy grand pmap lock, vrestart will be marked
10195 * PMAP_NEST_GRAND so that this function jumps straightly into step two
10196 * upon reentry.
10197 */
10198 if (!pmap_lock_preempt(grand, PMAP_LOCK_EXCLUSIVE)) {
10199 return vrestart;
10200 }
10201
10202 if (addr < grand->nested_pmap->nested_region_true_start) {
10203 addr = grand->nested_pmap->nested_region_true_start;
10204 }
10205
10206 start = addr;
10207
10208 while (addr < true_end) {
10209 tte_p = pmap_tte(grand, addr);
10210 /*
10211 * The nested pmap may have been trimmed before pmap_nest() completed for grand,
10212 * so it's possible that a region we're trying to unnest may not have been
10213 * nested in the first place.
10214 */
10215 if (tte_p != NULL) {
10216 *tte_p = ARM_TTE_TYPE_FAULT;
10217 }
10218 addr += pt_attr_twig_size(pt_attr);
10219 vrestart = addr | PMAP_NEST_GRAND;
10220 ++entry_count;
10221 if (__improbable(!(entry_count % PMAP_DEFAULT_PREEMPTION_CHECK_PAGE_INTERVAL) &&
10222 pmap_pending_preemption())) {
10223 break;
10224 }
10225 }
10226 if (addr >= true_end) {
10227 vrestart = vend | PMAP_NEST_GRAND;
10228 }
10229
10230 FLUSH_PTE_STRONG();
10231 PMAP_UPDATE_TLBS(grand, start, addr, false, false);
10232
10233 pmap_unlock(grand, PMAP_LOCK_EXCLUSIVE);
10234
10235 return vrestart;
10236 }
10237
10238 kern_return_t
10239 pmap_unnest_options(
10240 pmap_t grand,
10241 addr64_t vaddr,
10242 uint64_t size,
10243 unsigned int option)
10244 {
10245 vm_map_offset_t vrestart = (vm_map_offset_t)vaddr;
10246 vm_map_offset_t vend = vaddr + size;
10247
10248 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_START,
10249 VM_KERNEL_ADDRHIDE(grand), VM_KERNEL_ADDRHIDE(vaddr));
10250
10251 pmap_verify_preemptible();
10252 while (vrestart != (vend | PMAP_NEST_GRAND)) {
10253 #if XNU_MONITOR
10254 vrestart = pmap_unnest_options_ppl(grand, vaddr, size, vrestart, option);
10255 #else
10256 vrestart = pmap_unnest_options_internal(grand, vaddr, size, vrestart, option);
10257 #endif
10258 }
10259
10260 PMAP_TRACE(2, PMAP_CODE(PMAP__UNNEST) | DBG_FUNC_END, KERN_SUCCESS);
10261
10262 return KERN_SUCCESS;
10263 }
10264
/*
 * No unnest-parameter adjustment is required on this architecture; always
 * return TRUE so the caller proceeds (and can reach log_unnest_badness()).
 */
boolean_t
pmap_adjust_unnest_parameters(
	__unused pmap_t p,
	__unused vm_map_offset_t *s,
	__unused vm_map_offset_t *e)
{
	return TRUE; /* to get to log_unnest_badness()... */
}
10273
10274 #if PMAP_FORK_NEST
10275 /**
10276 * Perform any necessary pre-nesting of the parent's shared region at fork()
10277 * time.
10278 *
10279 * @note This should only be called from vm_map_fork().
10280 *
10281 * @param old_pmap The pmap of the parent task.
10282 * @param new_pmap The pmap of the child task.
10283 * @param nesting_start An output parameter that is updated with the start
10284 * address of the range that was pre-nested
10285 * @param nesting_end An output parameter that is updated with the end
10286 * address of the range that was pre-nested
10287 *
10288 * @return KERN_SUCCESS if the pre-nesting was succesfully completed.
10289 * KERN_INVALID_ARGUMENT if the arguments were not valid.
10290 */
kern_return_t
pmap_fork_nest(
	pmap_t old_pmap,
	pmap_t new_pmap,
	vm_map_offset_t *nesting_start,
	vm_map_offset_t *nesting_end)
{
	if (old_pmap == NULL || new_pmap == NULL) {
		return KERN_INVALID_ARGUMENT;
	}
	/* Parent has no shared region nested: nothing to pre-nest in the child. */
	if (old_pmap->nested_pmap == NULL) {
		return KERN_SUCCESS;
	}
	/*
	 * Nest the parent's shared-region pmap into the child at the same
	 * address and size.
	 * NOTE(review): pmap_nest()'s return value is ignored; a failure is only
	 * caught by the assertf() below, which release builds compile out —
	 * confirm this is intentional.
	 */
	pmap_nest(new_pmap,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	assertf(new_pmap->nested_pmap == old_pmap->nested_pmap &&
	    new_pmap->nested_region_addr == old_pmap->nested_region_addr &&
	    new_pmap->nested_region_size == old_pmap->nested_region_size,
	    "nested new (%p,0x%llx,0x%llx) old (%p,0x%llx,0x%llx)",
	    new_pmap->nested_pmap,
	    new_pmap->nested_region_addr,
	    new_pmap->nested_region_size,
	    old_pmap->nested_pmap,
	    old_pmap->nested_region_addr,
	    old_pmap->nested_region_size);
	/* Report the pre-nested range back to vm_map_fork(). */
	*nesting_start = old_pmap->nested_region_addr;
	*nesting_end = *nesting_start + old_pmap->nested_region_size;
	return KERN_SUCCESS;
}
10322 #endif /* PMAP_FORK_NEST */
10323
10324 /*
10325 * disable no-execute capability on
10326 * the specified pmap
10327 */
#if DEVELOPMENT || DEBUG
/**
 * Disable the no-execute capability on the specified pmap.
 * Available on development/debug builds only.
 */
void
pmap_disable_NX(
	pmap_t pmap)
{
	pmap->nx_enabled = FALSE;
}
#else
/* Release builds: NX cannot be disabled, so this is a no-op. */
void
pmap_disable_NX(
	__unused pmap_t pmap)
{
}
#endif
10342
10343 /*
10344 * flush a range of hardware TLB entries.
10345 * NOTE: assumes the smallest TLB entry in use will be for
10346 * an ARM small page (4K).
10347 */
10348
10349 #if __ARM_RANGE_TLBI__
10350 #define ARM64_RANGE_TLB_FLUSH_THRESHOLD (ARM64_TLB_RANGE_MIN_PAGES - 1)
10351 #define ARM64_FULL_TLB_FLUSH_THRESHOLD ARM64_TLB_RANGE_MAX_PAGES
10352 #else
10353 #define ARM64_FULL_TLB_FLUSH_THRESHOLD 256
10354 #endif // __ARM_RANGE_TLBI__
10355 static_assert(ARM64_FULL_TLB_FLUSH_THRESHOLD < (1ULL << (sizeof(ppnum_t) * 8)),
10356 "ARM64_FULL_TLB_FLUSH_THRESHOLD is too large so that the integer conversion"
10357 "of npages to 32 bits below may truncate.");
10358
/**
 * Issue (without waiting for) TLB invalidations covering the virtual range
 * [va, va + length) for the given pmap.
 *
 * Strategy, chosen by range size:
 *   1. Above ARM64_FULL_TLB_FLUSH_THRESHOLD pages: invalidate the entire
 *      ASID — or the whole TLB when the pmap has no ASID (asid == 0) or is
 *      a nested pmap (presumably because its entries may be present under
 *      multiple ASIDs — see the matching checks below).
 *   2. With range-TLBI support, above ARM64_RANGE_TLB_FLUSH_THRESHOLD pages:
 *      a single ranged invalidate.
 *   3. Otherwise: one invalidate per page.
 *
 * @param va              start of the virtual range.
 * @param length          length of the range, in bytes.
 * @param pmap            pmap whose mappings are being invalidated.
 * @param last_level_only pass-through to the TLBI primitives; restricts the
 *                        flush to last-level (leaf) entries where supported.
 * @param strong          pass-through requesting the stronger TLBI variant.
 */
static void
flush_mmu_tlb_region_asid_async(
	vm_offset_t va,
	size_t length,
	pmap_t pmap,
	bool last_level_only __unused,
	bool strong __unused)
{
	unsigned long pmap_page_shift = pt_attr_leaf_shift(pmap_get_pt_attr(pmap));
	const uint64_t pmap_page_size = 1ULL << pmap_page_shift;
	size_t npages = length >> pmap_page_shift;
	uint32_t asid;

	asid = pmap->hw_asid;

	if (npages > ARM64_FULL_TLB_FLUSH_THRESHOLD) {
		boolean_t flush_all = FALSE;

		/* No private ASID, or a nested pmap: must flush everything. */
		if ((asid == 0) || (pmap->type == PMAP_TYPE_NESTED)) {
			flush_all = TRUE;
		}
		if (flush_all) {
			flush_mmu_tlb_async();
		} else {
			flush_mmu_tlb_asid_async((uint64_t)asid << TLBI_ASID_SHIFT, strong);
		}
		return;
	}
#if __ARM_RANGE_TLBI__
	if (npages > ARM64_RANGE_TLB_FLUSH_THRESHOLD) {
		/**
		 * Note that casting npages to 32 bits here is always safe thanks to
		 * the ARM64_FULL_TLB_FLUSH_THRESHOLD check above.
		 */
		va = generate_rtlbi_param((ppnum_t) npages, asid, va, pmap_page_shift);
		if (pmap->type == PMAP_TYPE_NESTED) {
			flush_mmu_tlb_allrange_async(va, last_level_only, strong);
		} else {
			flush_mmu_tlb_range_async(va, last_level_only, strong);
		}
		return;
	}
#endif
	/* Small range: fold the ASID into the TLBI operands and flush per page. */
	vm_offset_t end = tlbi_asid(asid) | tlbi_addr(va + length);
	va = tlbi_asid(asid) | tlbi_addr(va);

	if (pmap->type == PMAP_TYPE_NESTED) {
		flush_mmu_tlb_allentries_async(va, end, pmap_page_size, last_level_only, strong);
	} else {
		flush_mmu_tlb_entries_async(va, end, pmap_page_size, last_level_only, strong);
	}
}
10411
/* Issue (without waiting for) a full TLB invalidate of the pmap's ASID. */
MARK_AS_PMAP_TEXT static void
flush_mmu_tlb_full_asid_async(pmap_t pmap)
{
	flush_mmu_tlb_asid_async((uint64_t)(pmap->hw_asid) << TLBI_ASID_SHIFT, false);
}
10417
/**
 * Synchronously flush kernel-pmap TLB entries covering [va, va + length):
 * queue the invalidations, then wait for them to complete.
 */
void
flush_mmu_tlb_region(
	vm_offset_t va,
	unsigned length)
{
	flush_mmu_tlb_region_asid_async(va, length, kernel_pmap, true, false);
	sync_tlb_flush();
}
10426
10427 unsigned int
10428 pmap_cache_attributes(
10429 ppnum_t pn)
10430 {
10431 pmap_paddr_t paddr;
10432 unsigned int pai;
10433 unsigned int result;
10434 pp_attr_t pp_attr_current;
10435
10436 paddr = ptoa(pn);
10437
10438 assert(vm_last_phys > vm_first_phys); // Check that pmap has been bootstrapped
10439
10440 if (!pa_valid(paddr)) {
10441 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
10442 return (io_rgn == NULL) ? VM_WIMG_IO : io_rgn->wimg;
10443 }
10444
10445 result = VM_WIMG_DEFAULT;
10446
10447 pai = pa_index(paddr);
10448
10449 pp_attr_current = pp_attr_table[pai];
10450 if (pp_attr_current & PP_ATTR_WIMG_MASK) {
10451 result = pp_attr_current & PP_ATTR_WIMG_MASK;
10452 }
10453 return result;
10454 }
10455
10456 MARK_AS_PMAP_TEXT static void
10457 pmap_sync_wimg(ppnum_t pn, unsigned int wimg_bits_prev, unsigned int wimg_bits_new)
10458 {
10459 if ((wimg_bits_prev != wimg_bits_new)
10460 && ((wimg_bits_prev == VM_WIMG_COPYBACK)
10461 || ((wimg_bits_prev == VM_WIMG_INNERWBACK)
10462 && (wimg_bits_new != VM_WIMG_COPYBACK))
10463 || ((wimg_bits_prev == VM_WIMG_WTHRU)
10464 && ((wimg_bits_new != VM_WIMG_COPYBACK) || (wimg_bits_new != VM_WIMG_INNERWBACK))))) {
10465 pmap_sync_page_attributes_phys(pn);
10466 }
10467
10468 if ((wimg_bits_new == VM_WIMG_RT) && (wimg_bits_prev != VM_WIMG_RT)) {
10469 pmap_force_dcache_clean(phystokv(ptoa(pn)), PAGE_SIZE);
10470 }
10471 }
10472
/*
 * Switch the cache attributes of a compressor-owned page from
 * prev_cacheattr to new_cacheattr: rewrite its mappings (with an immediate
 * TLB flush) under the PVH lock, then perform any required cache
 * maintenance once the lock is dropped.
 */
MARK_AS_PMAP_TEXT __unused void
pmap_update_compressor_page_internal(ppnum_t pn, unsigned int prev_cacheattr, unsigned int new_cacheattr)
{
	pmap_paddr_t paddr = ptoa(pn);
	const unsigned int pai = pa_index(paddr);

	if (__improbable(!pa_valid(paddr))) {
		panic("%s called on non-managed page 0x%08x", __func__, pn);
	}

	pvh_lock(pai);

#if XNU_MONITOR
	/* PPL-owned pages must never have their attributes changed from here. */
	if (__improbable(ppattr_pa_test_monitor(paddr))) {
		panic("%s invoked on PPL page 0x%08x", __func__, pn);
	}
#endif

	/* perform_tlbi == true: PTE rewrite and TLB flush happen here. */
	pmap_update_cache_attributes_locked(pn, new_cacheattr, true);

	pvh_unlock(pai);

	/* Cache maintenance is done after the mappings have been updated. */
	pmap_sync_wimg(pn, prev_cacheattr & VM_WIMG_MASK, new_cacheattr & VM_WIMG_MASK);
}
10497
/*
 * Return a kernel virtual address through which the compressor may access
 * page 'pn'.  On targets with a PTE-granular physical aperture, the page is
 * first forced to the default (cacheable) attribute if it currently has a
 * non-default one — presumably so compressor accesses through the physical
 * aperture are coherent; confirm against compressor callers.
 */
void *
pmap_map_compressor_page(ppnum_t pn)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, cacheattr, VM_WIMG_DEFAULT);
#else
		pmap_update_compressor_page_internal(pn, cacheattr, VM_WIMG_DEFAULT);
#endif
	}
#endif
	return (void*)phystokv(ptoa(pn));
}
10513
/*
 * Counterpart of pmap_map_compressor_page(): if the page's recorded cache
 * attribute is non-default, restore it (the map operation had temporarily
 * forced VM_WIMG_DEFAULT).  No-op when __ARM_PTE_PHYSMAP__ is not set.
 */
void
pmap_unmap_compressor_page(ppnum_t pn __unused, void *kva __unused)
{
#if __ARM_PTE_PHYSMAP__
	unsigned int cacheattr = pmap_cache_attributes(pn) & VM_WIMG_MASK;
	if (cacheattr != VM_WIMG_DEFAULT) {
#if XNU_MONITOR
		pmap_update_compressor_page_ppl(pn, VM_WIMG_DEFAULT, cacheattr);
#else
		pmap_update_compressor_page_internal(pn, VM_WIMG_DEFAULT, cacheattr);
#endif
	}
#endif
}
10528
10529 /**
10530 * Batch updates the cache attributes of a list of pages. This is a wrapper for
10531 * the ppl call on PPL-enabled platforms or the _internal helper on other platforms.
10532 *
10533 * @param user_page_list List of pages to be updated.
10534 * @param page_cnt Number of pages in total in user_page_list.
10535 * @param cacheattr The new cache attribute.
10536 *
10537 * @return Success if true is returned.
10538 */
10539 bool
10540 pmap_batch_set_cache_attributes(
10541 upl_page_info_array_t user_page_list,
10542 unsigned int page_cnt,
10543 unsigned int cacheattr)
10544 {
10545 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_START, page_cnt, cacheattr, 0xCECC0DE0);
10546
10547 if (page_cnt == 0) {
10548 return true;
10549 }
10550
10551 batch_set_cache_attr_state_t states;
10552 states.page_index = 0;
10553 states.state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS;
10554 states.tlb_flush_pass_needed = false;
10555 states.rt_cache_flush_pass_needed = false;
10556
10557 /* Verify we are being called from a preemptible context. */
10558 pmap_verify_preemptible();
10559
10560 while (states.state != PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE) {
10561 #if XNU_MONITOR
10562 states = pmap_batch_set_cache_attributes_ppl((volatile upl_page_info_t *) user_page_list, states, page_cnt, cacheattr);
10563 #else /* !XNU_MONITOR */
10564 states = pmap_batch_set_cache_attributes_internal(user_page_list, states, page_cnt, cacheattr);
10565 #endif /* XNU_MONITOR */
10566 }
10567
10568 PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING) | DBG_FUNC_END, page_cnt, cacheattr, 0xCECC0DEF);
10569 return true;
10570 }
10571
10572 /**
10573 * Flushes TLB entries associated with the page specified by paddr, but do not
10574 * issue barriers yet.
10575 *
10576 * @param paddr The physical address to be flushed from TLB. Must be a managed address.
10577 */
MARK_AS_PMAP_TEXT static void
pmap_flush_tlb_for_paddr_locked_async(pmap_paddr_t paddr)
{
#if __ARM_PTE_PHYSMAP__
	/* Flush the physical aperture mappings. */
	const vm_offset_t kva = phystokv(paddr);
	flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
#endif /* __ARM_PTE_PHYSMAP__ */

	/* Flush the mappings tracked in the ptes. */
	const unsigned int pai = pa_index(paddr);
	pv_entry_t **pv_h = pai_to_pvh(pai);

	pt_entry_t *pte_p = PT_ENTRY_NULL;
	pv_entry_t *pve_p = PV_ENTRY_NULL;

	pvh_assert_locked(pai);

	/* The PVH entry holds either a single PTE pointer or a PVE list. */
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Walk every mapping of the page, queueing a TLB flush for each VA. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next slot. */
				goto flush_tlb_skip_pte;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are not covered by the CPU TLBs; skip them. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto flush_tlb_skip_pte;
		}
#endif /* PVH_FLAG_IOMMU */
		pmap_t pmap = ptep_get_pmap(pte_p);
		vm_map_address_t va = ptep_get_va(pte_p);

		pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
		    pmap, true, false);

flush_tlb_skip_pte:
		pte_p = PT_ENTRY_NULL;
		/* Move to the next PTE slot; roll over to the next PVE when done. */
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
}
10631
10632 /**
10633 * Updates the pp_attr_table entry indexed by pai with cacheattr atomically.
10634 *
10635 * @param pai The Physical Address Index of the entry.
10636 * @param cacheattr The new cache attribute.
10637 */
10638 MARK_AS_PMAP_TEXT static void
10639 pmap_update_pp_attr_wimg_bits_locked(unsigned int pai, unsigned int cacheattr)
10640 {
10641 pvh_assert_locked(pai);
10642
10643 pp_attr_t pp_attr_current, pp_attr_template;
10644 do {
10645 pp_attr_current = pp_attr_table[pai];
10646 pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);
10647
10648 /**
10649 * WIMG bits should only be updated under the PVH lock, but we should do
10650 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
10651 */
10652 } while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));
10653 }
10654
10655 /**
10656 * Batch updates the cache attributes of a list of pages in three passes.
10657 *
10658 * In pass one, the pp_attr_table and the pte are updated for the pages in the list.
10659 * In pass two, TLB entries are flushed for each page in the list if necessary.
10660 * In pass three, caches are cleaned for each page in the list if necessary.
10661 *
10662 * When running in PPL, this function may decide to return to the caller in response
10663 * to AST_URGENT.
10664 *
10665 * @param user_page_list List of pages to be updated.
10666 * @param states The state of the state machine. See definition of batch_set_cache_attr_state_t.
10667 * @param page_cnt Number of pages in total in user_page_list.
10668 * @param cacheattr The new cache attributes.
10669 *
10670 * @return The new state of the state machine.
10671 */
MARK_AS_PMAP_TEXT batch_set_cache_attr_state_t
pmap_batch_set_cache_attributes_internal(
#if XNU_MONITOR
	volatile upl_page_info_t *user_page_list,
#else /* !XNU_MONITOR */
	upl_page_info_array_t user_page_list,
#endif /* XNU_MONITOR */
	batch_set_cache_attr_state_t states,
	unsigned int page_cnt,
	unsigned int cacheattr)
{
	/* Unpack the resumable cursor carried across (PPL) re-entries. */
	uint64_t page_index = states.page_index;
	uint64_t state = states.state;
	bool tlb_flush_pass_needed = !!(states.tlb_flush_pass_needed);
	bool rt_cache_flush_pass_needed = !!(states.rt_cache_flush_pass_needed);

	/* For verifying progress. */
	__assert_only const uint64_t page_index_old = page_index;
	__assert_only const uint64_t state_old = state;

	/* Assert page_index and state are within their range. */
	if (!(page_index < page_cnt && state < PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE)) {
		panic("%s: invalid input; page_index: %llu, page_cnt: %u, state: %llu", __func__, page_index, page_cnt, state);
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_UPDATE_PASS) {
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE1, page_index);
		/* Update cache attributes of the pages until there's an urgent AST or it's done. */
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;
			const pmap_paddr_t paddr = ptoa(pn);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			/* Lock the page. */
			pvh_lock(pai);

#if XNU_MONITOR
			if (ppattr_pa_test_monitor(paddr)) {
				panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
			}
#endif /* XNU_MONITOR */
			const pp_attr_t pp_attr_current = pp_attr_table[pai];

			/* A zero WIMG field in pp_attr means the default attribute. */
			unsigned int wimg_bits_prev = VM_WIMG_DEFAULT;
			if (pp_attr_current & PP_ATTR_WIMG_MASK) {
				wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
			}

			const pp_attr_t pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr);

			unsigned int wimg_bits_new = VM_WIMG_DEFAULT;
			if (pp_attr_template & PP_ATTR_WIMG_MASK) {
				wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
			}

			/* Update the cache attributes in PTE and PP_ATTR table. */
			if (wimg_bits_new != wimg_bits_prev) {
				tlb_flush_pass_needed |= pmap_update_cache_attributes_locked(pn, cacheattr, false);
				pmap_update_pp_attr_wimg_bits_locked(pai, cacheattr);
			}

			/* Transition into RT requires a cache-clean pass (pass 3). */
			if (wimg_bits_new == VM_WIMG_RT && wimg_bits_prev != VM_WIMG_RT) {
				rt_cache_flush_pass_needed = true;
			}

			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

		/* page_index == page_cnt && !pmap_pending_preemption() */
		if (tlb_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS;
		} else if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

		/* Sync the PTE writes before potential TLB/Cache flushes. */
		FLUSH_PTE_STRONG();

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_TLBFLUSH_PASS) {
		/**
		 * Pass 2: for each physical page and for each mapping, we need to flush
		 * the TLB for it.
		 */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE2, page_index);
		while (page_index < page_cnt) {
			const ppnum_t pn = user_page_list[page_index].phys_addr;

			const pmap_paddr_t paddr = ptoa(pn);
			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			const unsigned int pai = pa_index(paddr);

			pvh_lock(pai);
			pmap_flush_tlb_for_paddr_locked_async(paddr);
			pvh_unlock(pai);

			page_index++;

#if XNU_MONITOR
			/**
			 * Check for AST_URGENT every page, as the pve list search in cache
			 * update can take non-constant time.
			 */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((cacheattr & VM_WIMG_MASK) == VM_WIMG_RT);
#endif

		if (rt_cache_flush_pass_needed) {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS;
		} else {
			state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		}
		page_index = 0;

#if XNU_MONITOR
		if (__improbable(pmap_pending_preemption())) {
			goto pbscai_exit;
		}
#endif /* XNU_MONITOR */
	}

	if (state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_CACHEFLUSH_PASS) {
		/* Pass 3: Flush the cache if the page is recently set to RT */
		PMAP_TRACE(2, PMAP_CODE(PMAP__BATCH_UPDATE_CACHING), page_cnt, cacheattr, 0xCECC0DE3, page_index);
#if !XNU_MONITOR
		/**
		 * On non-PPL platforms, we disable preemption to ensure we are not preempted
		 * in the state where DC by VA instructions remain enabled.
		 */
		disable_preemption();
#endif /* !XNU_MONITOR */

		assert(get_preemption_level() > 0);

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		/**
		 * On APPLEVIRTUALPLATFORM, HID register accesses cause a synchronous exception
		 * and the host will handle cache maintenance for it. So we don't need to
		 * worry about enabling the ops here for AVP.
		 */
		enable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

		while (page_index < page_cnt) {
			const pmap_paddr_t paddr = ptoa(user_page_list[page_index].phys_addr);

			if (!pa_valid(paddr)) {
				panic("%s: page is not managed; addr: 0x%016llx", __func__, paddr);
			}

			CleanPoC_DcacheRegion_Force_nopreempt_nohid(phystokv(paddr), PAGE_SIZE);

			page_index++;

#if XNU_MONITOR
			/* Exiting mid-pass: DC ops must be disabled before leaving. */
			if (__improbable(pmap_pending_preemption() && (page_index < page_cnt))) {
#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
				disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */
				goto pbscai_exit;
			}
#endif /* XNU_MONITOR */
		}

#if defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM
		disable_dc_mva_ops();
#endif /* defined(APPLE_ARM64_ARCH_FAMILY) && !APPLEVIRTUALPLATFORM */

#if !XNU_MONITOR
		enable_preemption();
#endif /* !XNU_MONITOR */

		state = PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE;
		page_index = 0;
	}

#if XNU_MONITOR
pbscai_exit:
#endif /* XNU_MONITOR */
	/* Assert page_index and state are within their range. */
	assert(page_index < page_cnt || state == PMAP_BATCH_SET_CACHE_ATTRIBUTES_DONE);

	/* Make sure we are making progress in this call. */
	assert(page_index > page_index_old || state > state_old);

	/* Repack the cursor so the caller can resume (or finish). */
	batch_set_cache_attr_state_t states_new;
	states_new.page_index = page_index;
	states_new.state = state;
	states_new.tlb_flush_pass_needed = tlb_flush_pass_needed ? 1 : 0;
	states_new.rt_cache_flush_pass_needed = rt_cache_flush_pass_needed ? 1 : 0;
	return states_new;
}
10908
/*
 * Set the cache attributes of a single managed page: update the pp_attr
 * table (CAS loop under the PVH lock), rewrite the page's mappings with an
 * immediate TLB flush if the effective WIMG changed, then perform any
 * required cache maintenance.  'external' selects the PPL ownership check:
 * external callers may only touch non-PPL pages, and vice versa.
 */
MARK_AS_PMAP_TEXT static void
pmap_set_cache_attributes_priv(
	ppnum_t pn,
	unsigned int cacheattr,
	boolean_t external __unused)
{
	pmap_paddr_t paddr;
	unsigned int pai;
	pp_attr_t pp_attr_current;
	pp_attr_t pp_attr_template;
	unsigned int wimg_bits_prev, wimg_bits_new;

	paddr = ptoa(pn);

	if (!pa_valid(paddr)) {
		return; /* Not a managed page. */
	}

	if (cacheattr & VM_WIMG_USE_DEFAULT) {
		cacheattr = VM_WIMG_DEFAULT;
	}

	pai = pa_index(paddr);

	pvh_lock(pai);

#if XNU_MONITOR
	if (external && ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on PPL page 0x%llx", __func__, (uint64_t)paddr);
	} else if (!external && !ppattr_pa_test_monitor(paddr)) {
		panic("%s invoked on non-PPL page 0x%llx", __func__, (uint64_t)paddr);
	}
#endif

	do {
		pp_attr_current = pp_attr_table[pai];
		/* A zero WIMG field in pp_attr means the default attribute. */
		wimg_bits_prev = VM_WIMG_DEFAULT;
		if (pp_attr_current & PP_ATTR_WIMG_MASK) {
			wimg_bits_prev = pp_attr_current & PP_ATTR_WIMG_MASK;
		}

		pp_attr_template = (pp_attr_current & ~PP_ATTR_WIMG_MASK) | PP_ATTR_WIMG(cacheattr & (VM_WIMG_MASK));

		/**
		 * WIMG bits should only be updated under the PVH lock, but we should do
		 * this in a CAS loop to avoid losing simultaneous updates to other bits like refmod.
		 */
	} while (!OSCompareAndSwap16(pp_attr_current, pp_attr_template, &pp_attr_table[pai]));

	wimg_bits_new = VM_WIMG_DEFAULT;
	if (pp_attr_template & PP_ATTR_WIMG_MASK) {
		wimg_bits_new = pp_attr_template & PP_ATTR_WIMG_MASK;
	}

	/* Only rewrite mappings (and flush TLBs) if the effective WIMG changed. */
	if (wimg_bits_new != wimg_bits_prev) {
		pmap_update_cache_attributes_locked(pn, cacheattr, true);
	}

	pvh_unlock(pai);

	/* Cache maintenance happens after the mappings have been updated. */
	pmap_sync_wimg(pn, wimg_bits_prev, wimg_bits_new);
}
10971
/*
 * PPL entry point / non-PPL implementation for pmap_set_cache_attributes().
 * 'external' is TRUE: callers outside the PPL may only modify non-PPL pages.
 */
MARK_AS_PMAP_TEXT void
pmap_set_cache_attributes_internal(
	ppnum_t pn,
	unsigned int cacheattr)
{
	pmap_set_cache_attributes_priv(pn, cacheattr, TRUE);
}
10979
/*
 * Public wrapper: set the cache attributes of page 'pn', dispatching to the
 * PPL on monitor-enabled systems and to the in-kernel implementation
 * otherwise.
 */
void
pmap_set_cache_attributes(
	ppnum_t pn,
	unsigned int cacheattr)
{
#if XNU_MONITOR
	pmap_set_cache_attributes_ppl(pn, cacheattr);
#else
	pmap_set_cache_attributes_internal(pn, cacheattr);
#endif
}
10991
10992 /**
10993 * Updates the page numbered ppnum to have attribute specified by attributes.
10994 * If a TLB flush is necessary, it will be performed if perform_tlbi is true.
10995 * The necessity of the TLB flush is returned in case this function is called
10996 * in a batched manner and the TLB flush is intended to be done at a different
10997 * timing.
10998 *
10999 * @param ppnum Page Number of the page to be updated.
11000 * @param attributes The new cache attributes.
11001 * @param perform_tlbi When a TLB flush is needed, whether to perform the tlbi
11002 * immediately.
11003 *
11004 * @return Returns true if a TLB flush is needed for this update regardless of
11005 * whether a flush has occurred already.
11006 */
MARK_AS_PMAP_TEXT bool
pmap_update_cache_attributes_locked(
	ppnum_t ppnum,
	unsigned attributes,
	bool perform_tlbi)
{
	pmap_paddr_t phys = ptoa(ppnum);
	pv_entry_t *pve_p;
	pt_entry_t *pte_p;
	pv_entry_t **pv_h;
	pt_entry_t tmplate;
	unsigned int pai;
	boolean_t tlb_flush_needed = false;

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_START, ppnum, attributes);

	if (pmap_panic_dev_wimg_on_managed) {
		switch (attributes & VM_WIMG_MASK) {
		case VM_WIMG_IO: // nGnRnE
		case VM_WIMG_POSTED: // nGnRE
			/* supported on DRAM, but slow, so we disallow */
			/* FALLTHROUGH: all four device types hit the panic below. */

		case VM_WIMG_POSTED_REORDERED: // nGRE
		case VM_WIMG_POSTED_COMBINED_REORDERED: // GRE
			/* unsupported on DRAM */

			panic("%s: trying to use unsupported VM_WIMG type for managed page, VM_WIMG=%x, ppnum=%#x",
			    __FUNCTION__, attributes & VM_WIMG_MASK, ppnum);
			break;

		default:
			/* not device type memory, all good */

			break;
		}
	}

#if __ARM_PTE_PHYSMAP__
	/* First update the page's mapping in the physical aperture. */
	vm_offset_t kva = phystokv(phys);
	pte_p = pmap_pte(kernel_pmap, kva);

	tmplate = *pte_p;
	tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
#if XNU_MONITOR
	/* Preserve the PPL's permission (XPRR) bits in the aperture mapping. */
	tmplate |= (wimg_to_pte(attributes, phys) & ~ARM_PTE_XPRR_MASK);
#else
	tmplate |= wimg_to_pte(attributes, phys);
#endif
	if (tmplate & ARM_PTE_HINT_MASK) {
		panic("%s: physical aperture PTE %p has hint bit set, va=%p, pte=0x%llx",
		    __FUNCTION__, pte_p, (void *)kva, tmplate);
	}

	if (perform_tlbi) {
		write_pte_strong(pte_p, tmplate);
		flush_mmu_tlb_region_asid_async(kva, PAGE_SIZE, kernel_pmap, true, false);
	} else {
		/* Caller will batch the PTE sync and TLB flush later. */
		write_pte_fast(pte_p, tmplate);
	}
	tlb_flush_needed = true;
#endif

	pai = pa_index(phys);

	pv_h = pai_to_pvh(pai);

	/* The PVH entry holds either a single PTE pointer or a PVE list. */
	pte_p = PT_ENTRY_NULL;
	pve_p = PV_ENTRY_NULL;
	if (pvh_test_type(pv_h, PVH_TYPE_PTEP)) {
		pte_p = pvh_ptep(pv_h);
	} else if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
		pve_p = pvh_pve_list(pv_h);
		pte_p = PT_ENTRY_NULL;
	}

	/* Rewrite the attribute bits of every mapping of the page. */
	int pve_ptep_idx = 0;
	while ((pve_p != PV_ENTRY_NULL) || (pte_p != PT_ENTRY_NULL)) {
		vm_map_address_t va;
		pmap_t pmap;

		if (pve_p != PV_ENTRY_NULL) {
			pte_p = pve_get_ptep(pve_p, pve_ptep_idx);
			if (pte_p == PT_ENTRY_NULL) {
				/* Empty slot in this PVE; advance to the next slot. */
				goto cache_skip_pve;
			}
		}

#ifdef PVH_FLAG_IOMMU
		/* IOMMU mappings are managed by their drivers; skip them here. */
		if (pvh_ptep_is_iommu(pte_p)) {
			goto cache_skip_pve;
		}
#endif
		pmap = ptep_get_pmap(pte_p);
#if HAS_FEAT_XS
		/**
		 * TODO: we don't currently allow XS MAIR types on managed memory (see wimg_to_pte()),
		 * but if we change that we'll need to allow for "strong" TLBIs and DSBs in this function.
		 */
		assert(!pte_is_xs(pmap_get_pt_attr(pmap), *pte_p));
#endif /* HAS_FEAT_XS */
		va = ptep_get_va(pte_p);

		tmplate = *pte_p;
		tmplate &= ~(ARM_PTE_ATTRINDXMASK | ARM_PTE_SHMASK);
		tmplate |= pmap_get_pt_ops(pmap)->wimg_to_pte(attributes, phys);

		if (perform_tlbi) {
			write_pte_strong(pte_p, tmplate);
			pmap_get_pt_ops(pmap)->flush_tlb_region_async(va, pt_attr_page_size(pmap_get_pt_attr(pmap)) * PAGE_RATIO,
			    pmap, true, false);
		} else {
			/* Caller will batch the PTE sync and TLB flush later. */
			write_pte_fast(pte_p, tmplate);
		}
		tlb_flush_needed = true;

cache_skip_pve:
		pte_p = PT_ENTRY_NULL;
		if ((pve_p != PV_ENTRY_NULL) && (++pve_ptep_idx == PTE_PER_PVE)) {
			pve_ptep_idx = 0;
			pve_p = pve_next(pve_p);
		}
	}
	if (perform_tlbi && tlb_flush_needed) {
#if HAS_FEAT_XS
		/* With FEAT_XS, ordinary DSBs drain the prefetcher. */
		arm64_sync_tlb(false);
#else
		/**
		 * For targets that distinguish between mild and strong DSB, mild DSB
		 * will not drain the prefetcher. This can lead to prefetch-driven
		 * cache fills that defeat the uncacheable requirement of the RT memory type.
		 * In those cases, strong DSB must instead be employed to drain the prefetcher.
		 */
		arm64_sync_tlb((attributes & VM_WIMG_MASK) == VM_WIMG_RT);
#endif
	}

	PMAP_TRACE(2, PMAP_CODE(PMAP__UPDATE_CACHING) | DBG_FUNC_END, ppnum, attributes);

	return tlb_flush_needed;
}
11148
11149 /**
11150 * Mark a pmap as being dedicated to use for a commpage mapping.
11151 * The pmap itself will never be activated on a CPU; its mappings will
11152 * only be embedded in userspace pmaps at a fixed virtual address.
11153 *
11154 * @param pmap the pmap to mark as belonging to a commpage.
11155 */
static void
pmap_set_commpage(pmap_t pmap)
{
#if XNU_MONITOR
	/* Commpage pmaps must be configured before the PPL locks down. */
	assert(!pmap_ppl_locked_down);
#endif
	/* Only a freshly created user pmap may be converted to a commpage pmap. */
	assert(pmap->type == PMAP_TYPE_USER);
	pmap->type = PMAP_TYPE_COMMPAGE;
	/*
	 * Free the pmap's ASID. This pmap should not ever be directly
	 * activated in a CPU's TTBR. Freeing the ASID will not only reduce
	 * ASID space contention but will also cause pmap_switch() to panic
	 * if an attacker tries to activate this pmap. Disable preemption to
	 * accommodate the *_nopreempt spinlock in free_asid().
	 */
	mp_disable_preemption();
	pmap_get_pt_ops(pmap)->free_id(pmap);
	mp_enable_preemption();
}
11175
11176 static void
11177 pmap_update_tt3e(
11178 pmap_t pmap,
11179 vm_address_t address,
11180 tt_entry_t template)
11181 {
11182 tt_entry_t *ptep, pte;
11183
11184 ptep = pmap_tt3e(pmap, address);
11185 if (ptep == NULL) {
11186 panic("%s: no ptep?", __FUNCTION__);
11187 }
11188
11189 pte = *ptep;
11190 pte = tte_to_pa(pte) | template;
11191 write_pte_strong(ptep, pte);
11192 }
11193
11194 /* Note absence of non-global bit */
11195 #define PMAP_COMM_PAGE_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11196 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11197 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_NX \
11198 | ARM_PTE_PNX | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11199
11200 /* Note absence of non-global bit and no-execute bit. */
11201 #define PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE (ARM_PTE_TYPE_VALID \
11202 | ARM_PTE_ATTRINDX(CACHE_ATTRINDX_WRITEBACK) \
11203 | ARM_PTE_SH(SH_INNER_MEMORY) | ARM_PTE_PNX \
11204 | ARM_PTE_AP(AP_RORO) | ARM_PTE_AF)
11205
11206 void
11207 pmap_create_commpages(vm_map_address_t *kernel_data_addr, vm_map_address_t *kernel_text_addr,
11208 vm_map_address_t *kernel_ro_data_addr, vm_map_address_t *user_text_addr)
11209 {
11210 kern_return_t kr;
11211 pmap_paddr_t data_pa = 0; // data address
11212 pmap_paddr_t ro_data_pa = 0; // kernel read-only data address
11213 pmap_paddr_t text_pa = 0; // text address
11214
11215 *kernel_data_addr = 0;
11216 *kernel_text_addr = 0;
11217 *user_text_addr = 0;
11218
11219 #if XNU_MONITOR
11220 data_pa = pmap_alloc_page_for_kern(0);
11221 assert(data_pa);
11222 memset((char *) phystokv(data_pa), 0, PAGE_SIZE);
11223 ro_data_pa = pmap_alloc_page_for_kern(0);
11224 assert(ro_data_pa);
11225 memset((char *) phystokv(ro_data_pa), 0, PAGE_SIZE);
11226 #if CONFIG_ARM_PFZ
11227 text_pa = pmap_alloc_page_for_kern(0);
11228 assert(text_pa);
11229 memset((char *) phystokv(text_pa), 0, PAGE_SIZE);
11230 #endif
11231
11232 #else /* XNU_MONITOR */
11233 (void) pmap_pages_alloc_zeroed(&data_pa, PAGE_SIZE, 0);
11234 /*
11235 * For non-PPL devices, we have neither page lockdown nor a physical aperture
11236 * mapped at page granularity, so a separate page for kernel RO data would not
11237 * be useful.
11238 */
11239 ro_data_pa = data_pa;
11240 #if CONFIG_ARM_PFZ
11241 (void) pmap_pages_alloc_zeroed(&text_pa, PAGE_SIZE, 0);
11242 #endif
11243
11244 #endif /* XNU_MONITOR */
11245
11246 /*
11247 * In order to avoid burning extra pages on mapping the shared page, we
11248 * create a dedicated pmap for the shared page. We forcibly nest the
11249 * translation tables from this pmap into other pmaps. The level we
11250 * will nest at depends on the MMU configuration (page size, TTBR range,
11251 * etc). Typically, this is at L1 for 4K tasks and L2 for 16K tasks.
11252 *
11253 * Note that this is NOT "the nested pmap" (which is used to nest the
11254 * shared cache).
11255 *
11256 * Note that we update parameters of the entry for our unique needs (NG
11257 * entry, etc.).
11258 */
11259 commpage_pmap_default = pmap_create_options(NULL, 0x0, 0);
11260 assert(commpage_pmap_default != NULL);
11261 pmap_set_commpage(commpage_pmap_default);
11262
11263 /* The user 64-bit mappings... */
11264 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11265 assert(kr == KERN_SUCCESS);
11266 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11267
11268 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11269 assert(kr == KERN_SUCCESS);
11270 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11271 #if CONFIG_ARM_PFZ
11272 /* User mapping of comm page text section for 64 bit mapping only
11273 *
11274 * We don't insert it into the 32 bit mapping because we don't want 32 bit
11275 * user processes to get this page mapped in, they should never call into
11276 * this page.
11277 *
11278 * The data comm page is in a pre-reserved L3 VA range and the text commpage
11279 * is slid in the same L3 as the data commpage. It is either outside the
11280 * max of user VA or is pre-reserved in the vm_map_exec(). This means that
11281 * it is reserved and unavailable to mach VM for future mappings.
11282 */
11283 const pt_attr_t * const pt_attr = pmap_get_pt_attr(commpage_pmap_default);
11284 int num_ptes = pt_attr_leaf_size(pt_attr) >> PTE_SHIFT;
11285
11286 vm_map_address_t commpage_text_va = 0;
11287
11288 do {
11289 int text_leaf_index = random() % num_ptes;
11290
11291 // Generate a VA for the commpage text with the same root and twig index as data
11292 // comm page, but with new leaf index we've just generated.
11293 commpage_text_va = (_COMM_PAGE64_BASE_ADDRESS & ~pt_attr_leaf_index_mask(pt_attr));
11294 commpage_text_va |= (text_leaf_index << pt_attr_leaf_shift(pt_attr));
11295 } while ((commpage_text_va == _COMM_PAGE64_BASE_ADDRESS) || (commpage_text_va == _COMM_PAGE64_RO_ADDRESS)); // Try again if we collide (should be unlikely)
11296
11297 // Assert that this is empty
11298 __assert_only pt_entry_t *ptep = pmap_pte(commpage_pmap_default, commpage_text_va);
11299 assert(ptep != PT_ENTRY_NULL);
11300 assert(*ptep == ARM_TTE_EMPTY);
11301
11302 // At this point, we've found the address we want to insert our comm page at
11303 kr = pmap_enter_addr(commpage_pmap_default, commpage_text_va, text_pa, VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11304 assert(kr == KERN_SUCCESS);
11305 // Mark it as global page R/X so that it doesn't get thrown out on tlb flush
11306 pmap_update_tt3e(commpage_pmap_default, commpage_text_va, PMAP_COMM_PAGE_TEXT_PTE_TEMPLATE);
11307
11308 *user_text_addr = commpage_text_va;
11309 #endif
11310
11311 /* ...and the user 32-bit mappings. */
11312 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11313 assert(kr == KERN_SUCCESS);
11314 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11315
11316 kr = pmap_enter_addr(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11317 assert(kr == KERN_SUCCESS);
11318 pmap_update_tt3e(commpage_pmap_default, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11319 #if __ARM_MIXED_PAGE_SIZE__
11320 /**
11321 * To handle 4K tasks a new view/pmap of the shared page is needed. These are a
11322 * new set of page tables that point to the exact same 16K shared page as
11323 * before. Only the first 4K of the 16K shared page is mapped since that's
11324 * the only part that contains relevant data.
11325 */
11326 commpage_pmap_4k = pmap_create_options(NULL, 0x0, PMAP_CREATE_FORCE_4K_PAGES);
11327 assert(commpage_pmap_4k != NULL);
11328 pmap_set_commpage(commpage_pmap_4k);
11329
11330 /* The user 64-bit mappings... */
11331 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11332 assert(kr == KERN_SUCCESS);
11333 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11334
11335 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11336 assert(kr == KERN_SUCCESS);
11337 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE64_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11338
11339 /* ...and the user 32-bit mapping. */
11340 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11341 assert(kr == KERN_SUCCESS);
11342 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_BASE_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11343
11344 kr = pmap_enter_addr(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, ro_data_pa, VM_PROT_READ, VM_PROT_NONE, VM_WIMG_USE_DEFAULT, TRUE);
11345 assert(kr == KERN_SUCCESS);
11346 pmap_update_tt3e(commpage_pmap_4k, _COMM_PAGE32_RO_ADDRESS, PMAP_COMM_PAGE_PTE_TEMPLATE);
11347 #endif
11348
11349 /* For manipulation in kernel, go straight to physical page */
11350 *kernel_data_addr = phystokv(data_pa);
11351 assert(commpage_ro_data_kva == 0);
11352 *kernel_ro_data_addr = commpage_ro_data_kva = phystokv(ro_data_pa);
11353 assert(commpage_text_kva == 0);
11354 *kernel_text_addr = commpage_text_kva = (text_pa ? phystokv(text_pa) : 0);
11355 }
11356
11357
11358 /*
11359 * Asserts to ensure that the TTEs we nest to map the shared page do not overlap
11360 * with user controlled TTEs for regions that aren't explicitly reserved by the
11361 * VM (e.g., _COMM_PAGE64_NESTING_START/_COMM_PAGE64_BASE_ADDRESS).
11362 */
11363 #if (ARM_PGSHIFT == 14)
11364 /**
11365 * Ensure that 64-bit devices with 32-bit userspace VAs (arm64_32) can nest the
11366 * commpage completely above the maximum 32-bit userspace VA.
11367 */
11368 static_assert((_COMM_PAGE32_BASE_ADDRESS & ~ARM_TT_L2_OFFMASK) >= VM_MAX_ADDRESS);
11369
11370 /**
11371 * Normally there'd be an assert to check that 64-bit devices with 64-bit
11372 * userspace VAs can nest the commpage completely above the maximum 64-bit
11373 * userpace VA, but that technically isn't true on macOS. On those systems, the
11374 * commpage lives within the userspace VA range, but is protected by the VM as
11375 * a reserved region (see vm_reserved_regions[] definition for more info).
11376 */
11377
11378 #elif (ARM_PGSHIFT == 12)
11379 /**
11380 * Ensure that 64-bit devices using 4K pages can nest the commpage completely
11381 * above the maximum userspace VA.
11382 */
11383 static_assert((_COMM_PAGE64_BASE_ADDRESS & ~ARM_TT_L1_OFFMASK) >= MACH_VM_MAX_ADDRESS);
11384 #else
11385 #error Nested shared page mapping is unsupported on this config
11386 #endif
11387
/**
 * Map the commpage into a user address space by "nesting" the commpage pmap's
 * preallocated page tables into the given pmap: the twig-level table entry at
 * the commpage address is copied from the matching commpage pmap.
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 *
 * @return KERN_SUCCESS normally. KERN_ABORTED (and, under XNU_MONITOR,
 *         KERN_RESOURCE_SHORTAGE) can be returned when pmap_expand() fails in
 *         a retryable way; pmap_insert_commpage() loops on those codes.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_insert_commpage_internal(
	pmap_t pmap)
{
	kern_return_t kr = KERN_SUCCESS;
	vm_offset_t commpage_vaddr;
	pt_entry_t *ttep, *src_ttep;
	int options = 0;
	pmap_t commpage_pmap = commpage_pmap_default;

	/* Validate the pmap input before accessing its data. */
	validate_pmap_mutable(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_insert_commpage_internal requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if XNU_MONITOR
	/* In the PPL, never block on allocation; let the caller replenish pages. */
	options |= PMAP_OPTIONS_NOWAIT;
#endif /* XNU_MONITOR */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* The commpage lives at a different fixed VA for 32-bit processes. */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);

	/*
	 * For 4KB pages, we either "nest" at the level one page table (1GB) or level
	 * two (2MB) depending on the address space layout. For 16KB pages, each level
	 * one entry is 64GB, so we must go to the second level entry (32MB) in order
	 * to "nest".
	 *
	 * Note: This is not "nesting" in the shared cache sense. This definition of
	 * nesting just means inserting pointers to pre-allocated tables inside of
	 * the passed in pmap to allow us to share page tables (which map the shared
	 * page) for every task. This saves at least one page of memory per process
	 * compared to creating new page tables in every process for mapping the
	 * shared page.
	 */

	/**
	 * Allocate the twig page tables if needed, and slam a pointer to the shared
	 * page's tables into place.
	 */
	while ((ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr)) == TT_ENTRY_NULL) {
		/* pmap_expand() may sleep/allocate, so drop the lock around it. */
		pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

		kr = pmap_expand(pmap, commpage_vaddr, options, commpage_level);

		if (kr != KERN_SUCCESS) {
#if XNU_MONITOR
			if (kr == KERN_RESOURCE_SHORTAGE) {
				return kr;
			} else
#endif
			if (kr == KERN_ABORTED) {
				return kr;
			} else {
				panic("Failed to pmap_expand for commpage, pmap=%p", pmap);
			}
		}

		pmap_lock(pmap, PMAP_LOCK_EXCLUSIVE);
	}

	/*
	 * NOTE(review): this compares a twig-level entry against ARM_PTE_EMPTY,
	 * while pmap_unmap_commpage() compares the same slot against
	 * ARM_TTE_EMPTY — presumably both are zero; confirm they are equivalent.
	 */
	if (*ttep != ARM_PTE_EMPTY) {
		panic("%s: Found something mapped at the commpage address?!", __FUNCTION__);
	}

	/* Share the commpage pmap's pre-built table by copying its twig entry. */
	src_ttep = pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr);

	*ttep = *src_ttep;
	FLUSH_PTE_STRONG();

	pmap_unlock(pmap, PMAP_LOCK_EXCLUSIVE);

	return kr;
}
11490
/**
 * Remove the commpage mapping from a user pmap by clearing the twig-level
 * entry that pmap_insert_commpage_internal() copied in, then flushing the
 * TLB for the commpage VA.
 *
 * @param pmap The user pmap whose commpage mapping should be removed.
 */
static void
pmap_unmap_commpage(
	pmap_t pmap)
{
	pt_entry_t *ttep;
	vm_offset_t commpage_vaddr;
	pmap_t commpage_pmap = commpage_pmap_default;

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	const unsigned int commpage_level = pt_attr_commpage_level(pt_attr);

#if __ARM_MIXED_PAGE_SIZE__
#if !__ARM_16K_PG__
	/* The following code assumes that commpage_pmap_default is a 16KB pmap. */
	#error "pmap_unmap_commpage requires a 16KB default kernel page size when __ARM_MIXED_PAGE_SIZE__ is enabled"
#endif /* !__ARM_16K_PG__ */

	/* Choose the correct shared page pmap to use. */
	const uint64_t pmap_page_size = pt_attr_page_size(pt_attr);
	if (pmap_page_size == 16384) {
		commpage_pmap = commpage_pmap_default;
	} else if (pmap_page_size == 4096) {
		commpage_pmap = commpage_pmap_4k;
	} else {
		panic("No shared page pmap exists for the wanted page size: %llu", pmap_page_size);
	}
#endif /* __ARM_MIXED_PAGE_SIZE__ */

#if _COMM_PAGE_AREA_LENGTH != PAGE_SIZE
#error We assume a single page.
#endif

	/* Must mirror the VA selection in pmap_insert_commpage_internal(). */
	if (pmap_is_64bit(pmap)) {
		commpage_vaddr = _COMM_PAGE64_BASE_ADDRESS;
	} else {
		commpage_vaddr = _COMM_PAGE32_BASE_ADDRESS;
	}


	ttep = pmap_ttne(pmap, commpage_level, commpage_vaddr);

	/* Nothing was ever nested in this pmap; nothing to undo. */
	if (ttep == NULL) {
		return;
	}

	/* It had better be mapped to the shared page. */
	if (*ttep != ARM_TTE_EMPTY && *ttep != *pmap_ttne(commpage_pmap, commpage_level, commpage_vaddr)) {
		panic("%s: Something other than commpage mapped in shared page slot?", __FUNCTION__);
	}

	*ttep = ARM_TTE_EMPTY;
	FLUSH_PTE_STRONG();

	/* Invalidate any cached translations for the commpage VA in this ASID. */
	flush_mmu_tlb_region_asid_async(commpage_vaddr, PAGE_SIZE, pmap, false, false);
	sync_tlb_flush();
}
11547
/**
 * Public entry point for mapping the commpage into a user pmap. Retries the
 * internal/PPL call on transient failures (KERN_ABORTED, and under
 * XNU_MONITOR also KERN_RESOURCE_SHORTAGE after donating a page to the PPL);
 * any other failure is fatal.
 *
 * @param pmap The user pmap that should receive the commpage mapping.
 */
void
pmap_insert_commpage(
	pmap_t pmap)
{
	kern_return_t kr = KERN_FAILURE;
#if XNU_MONITOR
	do {
		kr = pmap_insert_commpage_ppl(pmap);

		/* The PPL ran out of free pages; donate one and retry. */
		if (kr == KERN_RESOURCE_SHORTAGE) {
			pmap_alloc_page_for_ppl(0);
		}
	} while (kr == KERN_RESOURCE_SHORTAGE || kr == KERN_ABORTED);

	pmap_ledger_check_balance(pmap);
#else
	do {
		kr = pmap_insert_commpage_internal(pmap);
	} while (kr == KERN_ABORTED);
#endif

	if (kr != KERN_SUCCESS) {
		panic("%s: failed to insert the shared page, kr=%d, "
		    "pmap=%p",
		    __FUNCTION__, kr,
		    pmap);
	}
}
11576
11577 static boolean_t
11578 pmap_is_64bit(
11579 pmap_t pmap)
11580 {
11581 return pmap->is_64bit;
11582 }
11583
11584 bool
11585 pmap_is_exotic(
11586 pmap_t pmap __unused)
11587 {
11588 return false;
11589 }
11590
11591
11592 /* ARMTODO -- an implementation that accounts for
11593 * holes in the physical map, if any.
11594 */
11595 boolean_t
11596 pmap_valid_page(
11597 ppnum_t pn)
11598 {
11599 return pa_valid(ptoa(pn));
11600 }
11601
11602 boolean_t
11603 pmap_bootloader_page(
11604 ppnum_t pn)
11605 {
11606 pmap_paddr_t paddr = ptoa(pn);
11607
11608 if (pa_valid(paddr)) {
11609 return FALSE;
11610 }
11611 pmap_io_range_t *io_rgn = pmap_find_io_attr(paddr);
11612 return (io_rgn != NULL) && (io_rgn->wimg & PMAP_IO_RANGE_CARVEOUT);
11613 }
11614
/**
 * Scan [va_start, va_end) in the given pmap and report whether the range
 * contains no valid leaf mappings.
 *
 * @param pmap     The pmap to scan; NULL is trivially empty.
 * @param va_start Start of the range to check.
 * @param va_end   End (exclusive) of the range to check.
 *
 * @return TRUE if no non-empty PTE was found in the range, FALSE otherwise.
 */
MARK_AS_PMAP_TEXT boolean_t
pmap_is_empty_internal(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
	vm_map_offset_t block_start, block_end;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return TRUE;
	}

	validate_pmap(pmap);

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	/*
	 * Snapshot not_in_kdp once so lock and unlock decisions stay
	 * consistent even if the global changes mid-scan; the debugger
	 * (kdp) path must not take locks.
	 */
	unsigned int initial_not_in_kdp = not_in_kdp;

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_lock(pmap, PMAP_LOCK_SHARED);
	}


	/* TODO: This will be faster if we increment ttep at each level. */
	block_start = va_start;

	/* Walk the range one twig-sized (leaf table) block at a time. */
	while (block_start < va_end) {
		pt_entry_t *bpte_p, *epte_p;
		pt_entry_t *pte_p;

		block_end = (block_start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr);
		if (block_end > va_end) {
			block_end = va_end;
		}

		tte_p = pmap_tte(pmap, block_start);
		if ((tte_p != PT_ENTRY_NULL)
		    && ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE)) {
			/* Check every PTE covering this block for a live mapping. */
			pte_p = (pt_entry_t *) ttetokv(*tte_p);
			bpte_p = &pte_p[pte_index(pt_attr, block_start)];
			epte_p = &pte_p[pte_index(pt_attr, block_end)];

			for (pte_p = bpte_p; pte_p < epte_p; pte_p++) {
				if (*pte_p != ARM_PTE_EMPTY) {
					if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
						pmap_unlock(pmap, PMAP_LOCK_SHARED);
					}
					return FALSE;
				}
			}
		}
		block_start = block_end;
	}

	if ((pmap != kernel_pmap) && (initial_not_in_kdp)) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
	}

	return TRUE;
}
11675
/**
 * Report whether [va_start, va_end) contains no valid mappings in the pmap.
 * Dispatches into the PPL when XNU_MONITOR is enabled, otherwise calls the
 * internal implementation directly.
 */
boolean_t
pmap_is_empty(
	pmap_t pmap,
	vm_map_offset_t va_start,
	vm_map_offset_t va_end)
{
#if XNU_MONITOR
	return pmap_is_empty_ppl(pmap, va_start, va_end);
#else
	return pmap_is_empty_internal(pmap, va_start, va_end);
#endif
}
11688
11689 vm_map_offset_t
11690 pmap_max_offset(
11691 boolean_t is64,
11692 unsigned int option)
11693 {
11694 return (is64) ? pmap_max_64bit_offset(option) : pmap_max_32bit_offset(option);
11695 }
11696
11697 vm_map_offset_t
11698 pmap_max_64bit_offset(
11699 __unused unsigned int option)
11700 {
11701 vm_map_offset_t max_offset_ret = 0;
11702
11703 #if defined(__arm64__)
11704 const vm_map_offset_t min_max_offset = ARM64_MIN_MAX_ADDRESS; // end of shared region + 512MB for various purposes
11705 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11706 max_offset_ret = arm64_pmap_max_offset_default;
11707 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11708 max_offset_ret = min_max_offset;
11709 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11710 max_offset_ret = MACH_VM_MAX_ADDRESS;
11711 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11712 if (arm64_pmap_max_offset_default) {
11713 max_offset_ret = arm64_pmap_max_offset_default;
11714 } else if (max_mem > 0xC0000000) {
11715 // devices with > 3GB of memory
11716 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_LARGE;
11717 } else if (max_mem > 0x40000000) {
11718 // devices with > 1GB and <= 3GB of memory
11719 max_offset_ret = ARM64_MAX_OFFSET_DEVICE_SMALL;
11720 } else {
11721 // devices with <= 1 GB of memory
11722 max_offset_ret = min_max_offset;
11723 }
11724 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11725 if (arm64_pmap_max_offset_default) {
11726 // Allow the boot-arg to override jumbo size
11727 max_offset_ret = arm64_pmap_max_offset_default;
11728 } else {
11729 max_offset_ret = MACH_VM_MAX_ADDRESS; // Max offset is 64GB for pmaps with special "jumbo" blessing
11730 }
11731 } else {
11732 panic("pmap_max_64bit_offset illegal option 0x%x", option);
11733 }
11734
11735 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11736 if (option != ARM_PMAP_MAX_OFFSET_DEFAULT) {
11737 assert(max_offset_ret >= min_max_offset);
11738 }
11739 #else
11740 panic("Can't run pmap_max_64bit_offset on non-64bit architectures");
11741 #endif
11742
11743 return max_offset_ret;
11744 }
11745
11746 vm_map_offset_t
11747 pmap_max_32bit_offset(
11748 unsigned int option)
11749 {
11750 vm_map_offset_t max_offset_ret = 0;
11751
11752 if (option == ARM_PMAP_MAX_OFFSET_DEFAULT) {
11753 max_offset_ret = arm_pmap_max_offset_default;
11754 } else if (option == ARM_PMAP_MAX_OFFSET_MIN) {
11755 max_offset_ret = VM_MAX_ADDRESS;
11756 } else if (option == ARM_PMAP_MAX_OFFSET_MAX) {
11757 max_offset_ret = VM_MAX_ADDRESS;
11758 } else if (option == ARM_PMAP_MAX_OFFSET_DEVICE) {
11759 if (arm_pmap_max_offset_default) {
11760 max_offset_ret = arm_pmap_max_offset_default;
11761 } else if (max_mem > 0x20000000) {
11762 max_offset_ret = VM_MAX_ADDRESS;
11763 } else {
11764 max_offset_ret = VM_MAX_ADDRESS;
11765 }
11766 } else if (option == ARM_PMAP_MAX_OFFSET_JUMBO) {
11767 max_offset_ret = VM_MAX_ADDRESS;
11768 } else {
11769 panic("pmap_max_32bit_offset illegal option 0x%x", option);
11770 }
11771
11772 assert(max_offset_ret <= MACH_VM_MAX_ADDRESS);
11773 return max_offset_ret;
11774 }
11775
11776 #if CONFIG_DTRACE
11777 /*
11778 * Constrain DTrace copyin/copyout actions
11779 */
11780 extern kern_return_t dtrace_copyio_preflight(addr64_t);
11781 extern kern_return_t dtrace_copyio_postflight(addr64_t);
11782
11783 kern_return_t
11784 dtrace_copyio_preflight(
11785 __unused addr64_t va)
11786 {
11787 if (current_map() == kernel_map) {
11788 return KERN_FAILURE;
11789 } else {
11790 return KERN_SUCCESS;
11791 }
11792 }
11793
/*
 * DTrace copyio postflight: no architecture-specific cleanup is needed, so
 * this always succeeds.
 */
kern_return_t
dtrace_copyio_postflight(
	__unused addr64_t va)
{
	return KERN_SUCCESS;
}
11800 #endif /* CONFIG_DTRACE */
11801
11802
/*
 * Initialize a pmap flush context. No per-context state is maintained on
 * this architecture, so there is nothing to set up.
 */
void
pmap_flush_context_init(__unused pmap_flush_context *pfc)
{
}
11807
11808
11809 void
11810 pmap_flush(
11811 __unused pmap_flush_context *cpus_to_flush)
11812 {
11813 /* not implemented yet */
11814 return;
11815 }
11816
11817 #if XNU_MONITOR
11818
11819 /*
11820 * Enforce that the address range described by kva and nbytes is not currently
11821 * PPL-owned, and won't become PPL-owned while pinned. This is to prevent
11822 * unintentionally writing to PPL-owned memory.
11823 */
11824 void
11825 pmap_pin_kernel_pages(vm_offset_t kva, size_t nbytes)
11826 {
11827 vm_offset_t end;
11828 if (os_add_overflow(kva, nbytes, &end)) {
11829 panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
11830 }
11831 for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
11832 pmap_paddr_t pa = kvtophys_nofail(ckva);
11833 unsigned int pai = pa_index(pa);
11834 pp_attr_t attr;
11835 if (__improbable(!pa_valid(pa))) {
11836 panic("%s(%s): attempt to pin mapping of non-managed page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11837 }
11838 pvh_lock(pai);
11839 if (__improbable(ckva == phystokv(pa))) {
11840 panic("%s(%p): attempt to pin static mapping for page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11841 }
11842 do {
11843 attr = pp_attr_table[pai] & ~PP_ATTR_NO_MONITOR;
11844 if (__improbable(attr & PP_ATTR_MONITOR)) {
11845 panic("%s(%p): physical page 0x%llx belongs to PPL", __func__, (void*)kva, (uint64_t)pa);
11846 }
11847 } while (!OSCompareAndSwap16(attr, attr | PP_ATTR_NO_MONITOR, &pp_attr_table[pai]));
11848 pvh_unlock(pai);
11849 if (__improbable(kvtophys_nofail(ckva) != pa)) {
11850 panic("%s(%p): VA no longer mapped to physical page 0x%llx", __func__, (void*)kva, (uint64_t)pa);
11851 }
11852 }
11853 }
11854
/*
 * Release a range previously pinned by pmap_pin_kernel_pages(), clearing
 * PP_ATTR_NO_MONITOR on each page in [kva, kva + nbytes).
 *
 * @param kva    Kernel virtual base of the range to unpin.
 * @param nbytes Length of the range in bytes.
 */
void
pmap_unpin_kernel_pages(vm_offset_t kva, size_t nbytes)
{
	vm_offset_t end;
	if (os_add_overflow(kva, nbytes, &end)) {
		panic("%s(%p, 0x%llx): overflow", __func__, (void*)kva, (uint64_t)nbytes);
	}
	for (vm_offset_t ckva = trunc_page(kva); ckva < end; ckva = round_page(ckva + 1)) {
		pmap_paddr_t pa = kvtophys_nofail(ckva);

		/* Unpinning a page that was never pinned indicates a bug. */
		if (!(pp_attr_table[pa_index(pa)] & PP_ATTR_NO_MONITOR)) {
			panic("%s(%p): physical page 0x%llx not pinned", __func__, (void*)kva, (uint64_t)pa);
		}
		/* A pinned page can never have become PPL-owned meanwhile. */
		assert(!(pp_attr_table[pa_index(pa)] & PP_ATTR_MONITOR));
		ppattr_pa_clear_no_monitor(pa);
	}
}
11872
11873 /**
11874 * Lock down a page, making all mappings read-only, and preventing further
11875 * mappings or removal of this particular kva's mapping. Effectively, it makes
11876 * the physical page at kva immutable (see the ppl_writable parameter for an
11877 * exception to this).
11878 *
11879 * @param kva Valid address to any mapping of the physical page to lockdown.
11880 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11881 * @param ppl_writable True if the PPL should still be able to write to the page
11882 * using the physical aperture mapping. False will make the
11883 * page read-only for both the kernel and PPL in the
11884 * physical aperture.
11885 */
11886
11887 MARK_AS_PMAP_TEXT static void
11888 pmap_ppl_lockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
11889 {
11890 pmap_ppl_lockdown_page_with_prot(kva, lockdown_flag, ppl_writable, VM_PROT_READ);
11891 }
11892
11893 /**
11894 * Lock down a page, giving all mappings the specified maximum permissions, and
11895 * preventing further mappings or removal of this particular kva's mapping.
11896 * Effectively, it makes the physical page at kva immutable (see the ppl_writable
11897 * parameter for an exception to this).
11898 *
11899 * @param kva Valid address to any mapping of the physical page to lockdown.
11900 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11901 * @param ppl_writable True if the PPL should still be able to write to the page
11902 * using the physical aperture mapping. False will make the
11903 * page read-only for both the kernel and PPL in the
11904 * physical aperture.
11905 * @param prot Maximum permissions to allow in existing alias mappings
11906 */
11907 MARK_AS_PMAP_TEXT static void
11908 pmap_ppl_lockdown_page_with_prot(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable, vm_prot_t prot)
11909 {
11910 const pmap_paddr_t pa = kvtophys_nofail(kva);
11911 const unsigned int pai = pa_index(pa);
11912
11913 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
11914 pvh_lock(pai);
11915 pv_entry_t **pvh = pai_to_pvh(pai);
11916 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11917
11918 if (__improbable(ppattr_pa_test_monitor(pa))) {
11919 panic("%s: %#lx (page %llx) belongs to PPL", __func__, kva, pa);
11920 }
11921
11922 if (__improbable(pvh_flags & (PVH_FLAG_LOCKDOWN_MASK | PVH_FLAG_EXEC))) {
11923 panic("%s: %#lx already locked down/executable (%#llx)",
11924 __func__, kva, (uint64_t)pvh_flags);
11925 }
11926
11927
11928 pvh_set_flags(pvh, pvh_flags | lockdown_flag);
11929
11930 /* Update the physical aperture mapping to prevent kernel write access. */
11931 const unsigned int new_xprr_perm =
11932 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11933 pmap_set_xprr_perm(pai, XPRR_KERN_RW_PERM, new_xprr_perm);
11934
11935 pvh_unlock(pai);
11936
11937 pmap_page_protect_options_internal((ppnum_t)atop(pa), prot, 0, NULL);
11938
11939 /**
11940 * Double-check that the mapping didn't change physical addresses before the
11941 * LOCKDOWN flag was set (there is a brief window between the above
11942 * kvtophys() and pvh_lock() calls where the mapping could have changed).
11943 *
11944 * This doesn't solve the ABA problem, but this doesn't have to since once
11945 * the pvh_lock() is grabbed no new mappings can be created on this physical
11946 * page without the LOCKDOWN flag already set (so any future mappings can
11947 * only be RO, and no existing mappings can be removed).
11948 */
11949 if (kvtophys_nofail(kva) != pa) {
11950 panic("%s: Physical address of mapping changed while setting LOCKDOWN "
11951 "flag %#lx %#llx", __func__, kva, (uint64_t)pa);
11952 }
11953 }
11954
11955 /**
11956 * Helper for releasing a page from being locked down to the PPL, making it writable to the
11957 * kernel once again.
11958 *
11959 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11960 * to unlockdown a page that was never locked down, will panic.
11961 *
11962 * @param pai physical page index to release from lockdown. PVH lock for this page must be held.
11963 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11964 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11965 * passed to the paired pmap_ppl_lockdown_page() call. Any
11966 * deviation will result in a panic.
11967 */
11968 MARK_AS_PMAP_TEXT static void
11969 pmap_ppl_unlockdown_page_locked(unsigned int pai, uint64_t lockdown_flag, bool ppl_writable)
11970 {
11971 pvh_assert_locked(pai);
11972 pv_entry_t **pvh = pai_to_pvh(pai);
11973 const vm_offset_t pvh_flags = pvh_get_flags(pvh);
11974
11975 if (__improbable(!(pvh_flags & lockdown_flag))) {
11976 panic("%s: unlockdown attempt on not locked down pai %d, type=0x%llx, PVH flags=0x%llx",
11977 __func__, pai, (unsigned long long)lockdown_flag, (unsigned long long)pvh_flags);
11978 }
11979
11980
11981 pvh_set_flags(pvh, pvh_flags & ~lockdown_flag);
11982
11983 /* Restore the pre-lockdown physical aperture mapping permissions. */
11984 const unsigned int old_xprr_perm =
11985 (ppl_writable) ? XPRR_PPL_RW_PERM : XPRR_KERN_RO_PERM;
11986 pmap_set_xprr_perm(pai, old_xprr_perm, XPRR_KERN_RW_PERM);
11987 }
11988
11989 /**
11990 * Release a page from being locked down to the PPL, making it writable to the
11991 * kernel once again.
11992 *
11993 * @note This must be paired with a pmap_ppl_lockdown_page() call. Any attempts
11994 * to unlockdown a page that was never locked down, will panic.
11995 *
11996 * @param kva Valid address to any mapping of the physical page to unlockdown.
11997 * @param lockdown_flag Bit within PVH_FLAG_LOCKDOWN_MASK specifying the lockdown reason
11998 * @param ppl_writable This must match whatever `ppl_writable` parameter was
11999 * passed to the paired pmap_ppl_lockdown_page() call. Any
12000 * deviation will result in a panic.
12001 */
12002 MARK_AS_PMAP_TEXT static void
12003 pmap_ppl_unlockdown_page(vm_address_t kva, uint64_t lockdown_flag, bool ppl_writable)
12004 {
12005 const pmap_paddr_t pa = kvtophys_nofail(kva);
12006 const unsigned int pai = pa_index(pa);
12007
12008 assert(lockdown_flag & PVH_FLAG_LOCKDOWN_MASK);
12009 pvh_lock(pai);
12010 pmap_ppl_unlockdown_page_locked(pai, lockdown_flag, ppl_writable);
12011 pvh_unlock(pai);
12012 }
12013
12014 #else /* XNU_MONITOR */
12015
/* Page pinning only has meaning when the PPL (XNU_MONITOR) exists; no-op here. */
void __unused
pmap_pin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12020
/* Counterpart of the pin stub above; likewise a no-op without XNU_MONITOR. */
void __unused
pmap_unpin_kernel_pages(vm_offset_t kva __unused, size_t nbytes __unused)
{
}
12025
12026 #endif /* !XNU_MONITOR */
12027
12028
/*
 * Lock down [kva, kva + size) for code-signing purposes. Under XNU_MONITOR
 * the pages are tagged with the PVH_FLAG_LOCKDOWN_CS reason; otherwise no
 * lockdown flag is applied.
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_lockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_lockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_lockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12038
/*
 * Undo pmap_cs_lockdown_pages() for [kva, kva + size), using the same
 * lockdown flag selection (PVH_FLAG_LOCKDOWN_CS under XNU_MONITOR, 0
 * otherwise).
 */
MARK_AS_PMAP_TEXT static inline void
pmap_cs_unlockdown_pages(vm_address_t kva, vm_size_t size, bool ppl_writable)
{
#if XNU_MONITOR
	pmap_ppl_unlockdown_pages(kva, size, PVH_FLAG_LOCKDOWN_CS, ppl_writable);
#else
	pmap_ppl_unlockdown_pages(kva, size, 0, ppl_writable);
#endif
}
12048
12049 /**
12050 * Perform basic validation checks on the destination only and
12051 * corresponding offset/sizes prior to writing to a read only allocation.
12052 *
12053 * @note Should be called before writing to an allocation from the read
12054 * only allocator.
12055 *
12056 * @param zid The ID of the zone the allocation belongs to.
12057 * @param va VA of element being modified (destination).
12058 * @param offset Offset being written to, in the element.
12059 * @param new_data_size Size of modification.
12060 *
12061 */
12062
12063 MARK_AS_PMAP_TEXT static void
12064 pmap_ro_zone_validate_element_dst(
12065 zone_id_t zid,
12066 vm_offset_t va,
12067 vm_offset_t offset,
12068 vm_size_t new_data_size)
12069 {
12070 if (__improbable((zid < ZONE_ID__FIRST_RO) || (zid > ZONE_ID__LAST_RO))) {
12071 panic("%s: ZoneID %u outside RO range %u - %u", __func__, zid,
12072 ZONE_ID__FIRST_RO, ZONE_ID__LAST_RO);
12073 }
12074
12075 vm_size_t elem_size = zone_ro_size_params[zid].z_elem_size;
12076
12077 /* Check element is from correct zone and properly aligned */
12078 zone_require_ro(zid, elem_size, (void*)va);
12079
12080 if (__improbable(new_data_size > (elem_size - offset))) {
12081 panic("%s: New data size %lu too large for elem size %lu at addr %p",
12082 __func__, (uintptr_t)new_data_size, (uintptr_t)elem_size, (void*)va);
12083 }
12084 if (__improbable(offset >= elem_size)) {
12085 panic("%s: Offset %lu too large for elem size %lu at addr %p",
12086 __func__, (uintptr_t)offset, (uintptr_t)elem_size, (void*)va);
12087 }
12088 }
12089
12090
12091 /**
12092 * Perform basic validation checks on the source, destination and
12093 * corresponding offset/sizes prior to writing to a read only allocation.
12094 *
12095 * @note Should be called before writing to an allocation from the read
12096 * only allocator.
12097 *
12098 * @param zid The ID of the zone the allocation belongs to.
12099 * @param va VA of element being modified (destination).
12100 * @param offset Offset being written to, in the element.
12101 * @param new_data Pointer to new data (source).
12102 * @param new_data_size Size of modification.
12103 *
12104 */
12105
12106 MARK_AS_PMAP_TEXT static void
12107 pmap_ro_zone_validate_element(
12108 zone_id_t zid,
12109 vm_offset_t va,
12110 vm_offset_t offset,
12111 const vm_offset_t new_data,
12112 vm_size_t new_data_size)
12113 {
12114 vm_offset_t sum = 0;
12115
12116 if (__improbable(os_add_overflow(new_data, new_data_size, &sum))) {
12117 panic("%s: Integer addition overflow %p + %lu = %lu",
12118 __func__, (void*)new_data, (uintptr_t)new_data_size, (uintptr_t)sum);
12119 }
12120
12121 pmap_ro_zone_validate_element_dst(zid, va, offset, new_data_size);
12122 }
12123
12124 /**
12125 * Ensure that physical page is locked down before writing to it.
12126 *
12127 * @note Should be called before writing to an allocation from the read
12128 * only allocator. This function pairs with pmap_ro_zone_unlock_phy_page,
12129 * ensure that it is called after the modification.
12130 *
12131 *
12132 * @param pa Physical address of the element being modified.
12133 * @param va Virtual address of element being modified.
12134 * @param size Size of the modification.
12135 *
12136 */
12137
12138 MARK_AS_PMAP_TEXT static void
12139 pmap_ro_zone_lock_phy_page(
12140 const pmap_paddr_t pa,
12141 vm_offset_t va,
12142 vm_size_t size)
12143 {
12144 if (__improbable(trunc_page(va + size - 1) != trunc_page(va))) {
12145 panic("%s: va 0x%llx size 0x%llx crosses page boundary",
12146 __func__, (unsigned long long)va, (unsigned long long)size);
12147 }
12148 const unsigned int pai = pa_index(pa);
12149 pvh_lock(pai);
12150
12151 /* Ensure that the physical page is locked down */
12152 #if XNU_MONITOR
12153 pv_entry_t **pvh = pai_to_pvh(pai);
12154 if (!(pvh_get_flags(pvh) & PVH_FLAG_LOCKDOWN_RO)) {
12155 panic("%s: Physical page not locked down %llx", __func__, pa);
12156 }
12157 #endif /* XNU_MONITOR */
12158 }
12159
12160 /**
12161 * Unlock physical page after writing to it.
12162 *
12163 * @note Should be called after writing to an allocation from the read
12164 * only allocator. This function pairs with pmap_ro_zone_lock_phy_page,
12165 * ensure that it has been called prior to the modification.
12166 *
12167 * @param pa Physical address of the element that was modified.
12168 * @param va Virtual address of element that was modified.
12169 * @param size Size of the modification.
12170 *
12171 */
12172
12173 MARK_AS_PMAP_TEXT static void
12174 pmap_ro_zone_unlock_phy_page(
12175 const pmap_paddr_t pa,
12176 vm_offset_t va __unused,
12177 vm_size_t size __unused)
12178 {
12179 const unsigned int pai = pa_index(pa);
12180 pvh_unlock(pai);
12181 }
12182
12183 /**
12184 * Function to copy kauth_cred from new_data to kv.
12185 * Function defined in "kern_prot.c"
12186 *
12187 * @note Will be removed upon completion of
12188 * <rdar://problem/72635194> Compiler PAC support for memcpy.
12189 *
12190 * @param kv Address to copy new data to.
12191 * @param new_data Pointer to new data.
12192 *
12193 */
12194
12195 extern void
12196 kauth_cred_copy(const uintptr_t kv, const uintptr_t new_data);
12197
12198 /**
12199 * Zalloc-specific memcpy that writes through the physical aperture
12200 * and ensures the element being modified is from a read-only zone.
12201 *
12202 * @note Designed to work only with the zone allocator's read-only submap.
12203 *
12204 * @param zid The ID of the zone to allocate from.
12205 * @param va VA of element to be modified.
12206 * @param offset Offset from element.
12207 * @param new_data Pointer to new data.
12208 * @param new_data_size Size of modification.
12209 *
12210 */
12211
12212 void
12213 pmap_ro_zone_memcpy(
12214 zone_id_t zid,
12215 vm_offset_t va,
12216 vm_offset_t offset,
12217 const vm_offset_t new_data,
12218 vm_size_t new_data_size)
12219 {
12220 #if XNU_MONITOR
12221 pmap_ro_zone_memcpy_ppl(zid, va, offset, new_data, new_data_size);
12222 #else /* XNU_MONITOR */
12223 pmap_ro_zone_memcpy_internal(zid, va, offset, new_data, new_data_size);
12224 #endif /* XNU_MONITOR */
12225 }
12226
12227 MARK_AS_PMAP_TEXT void
12228 pmap_ro_zone_memcpy_internal(
12229 zone_id_t zid,
12230 vm_offset_t va,
12231 vm_offset_t offset,
12232 const vm_offset_t new_data,
12233 vm_size_t new_data_size)
12234 {
12235 const pmap_paddr_t pa = kvtophys_nofail(va + offset);
12236
12237 if (!new_data || new_data_size == 0) {
12238 return;
12239 }
12240
12241 pmap_ro_zone_validate_element(zid, va, offset, new_data, new_data_size);
12242 pmap_ro_zone_lock_phy_page(pa, va, new_data_size);
12243 memcpy((void*)phystokv(pa), (void*)new_data, new_data_size);
12244 pmap_ro_zone_unlock_phy_page(pa, va, new_data_size);
12245 }
12246
12247 /**
12248 * Zalloc-specific function to atomically mutate fields of an element that
12249 * belongs to a read-only zone, via the physcial aperture.
12250 *
12251 * @note Designed to work only with the zone allocator's read-only submap.
12252 *
12253 * @param zid The ID of the zone the element belongs to.
12254 * @param va VA of element to be modified.
12255 * @param offset Offset in element.
12256 * @param op Atomic operation to perform.
12257 * @param value Mutation value.
12258 *
12259 */
12260
uint64_t
pmap_ro_zone_atomic_op(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	return pmap_ro_zone_atomic_op_ppl(zid, va, offset, op, value);
#else /* XNU_MONITOR */
	/* Non-monitor builds: invoke the implementation directly. */
	return pmap_ro_zone_atomic_op_internal(zid, va, offset, op, value);
#endif /* XNU_MONITOR */
}
12275
/*
 * Implementation of pmap_ro_zone_atomic_op(): perform an atomic mutation on a
 * read-only zone element through the physical aperture, and return the value
 * produced by __zalloc_ro_mut_atomic.
 */
MARK_AS_PMAP_TEXT uint64_t
pmap_ro_zone_atomic_op_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	zro_atomic_op_t op,
	uint64_t value)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* The operand width is encoded in the low nibble of the op code. */
	vm_size_t value_size = op & 0xf;

	/* Validate the destination range, then hold the physical page locked
	 * across the atomic so the mapping cannot change underneath it. */
	pmap_ro_zone_validate_element_dst(zid, va, offset, value_size);
	pmap_ro_zone_lock_phy_page(pa, va, value_size);
	value = __zalloc_ro_mut_atomic(phystokv(pa), op, value);
	pmap_ro_zone_unlock_phy_page(pa, va, value_size);

	return value;
}
12294
12295 /**
12296 * bzero for allocations from read only zones, that writes through the
12297 * physical aperture.
12298 *
12299 * @note This is called by the zfree path of all allocations from read
12300 * only zones.
12301 *
12302 * @param zid The ID of the zone the allocation belongs to.
12303 * @param va VA of element to be zeroed.
12304 * @param offset Offset in the element.
12305 * @param size Size of allocation.
12306 *
12307 */
12308
void
pmap_ro_zone_bzero(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	pmap_ro_zone_bzero_ppl(zid, va, offset, size);
#else /* XNU_MONITOR */
	/* Non-monitor builds: invoke the implementation directly. */
	pmap_ro_zone_bzero_internal(zid, va, offset, size);
#endif /* XNU_MONITOR */
}
12322
/*
 * Implementation of pmap_ro_zone_bzero(): zero a read-only zone element by
 * writing through the physical aperture. Called on the zfree path.
 *
 * NOTE(review): unlike pmap_ro_zone_memcpy_internal there is no zero-size
 * early return here; presumably zfree always passes a non-zero size — confirm.
 */
MARK_AS_PMAP_TEXT void
pmap_ro_zone_bzero_internal(
	zone_id_t zid,
	vm_offset_t va,
	vm_offset_t offset,
	vm_size_t size)
{
	const pmap_paddr_t pa = kvtophys_nofail(va + offset);
	/* new_data is 0 here: there is no source buffer to validate for a zero fill. */
	pmap_ro_zone_validate_element(zid, va, offset, 0, size);
	pmap_ro_zone_lock_phy_page(pa, va, size);
	bzero((void*)phystokv(pa), size);
	pmap_ro_zone_unlock_phy_page(pa, va, size);
}
12336
12337 /**
12338 * Removes write access from the Physical Aperture.
12339 *
12340 * @note For non-PPL devices, it simply makes all virtual mappings RO.
12341 * @note Designed to work only with the zone allocator's read-only submap.
12342 *
 * @param va VA of the page to remove write access from.
12344 *
12345 */
MARK_AS_PMAP_TEXT static void
pmap_phys_write_disable(vm_address_t va)
{
#if XNU_MONITOR
	/* PPL builds: lock the page down read-only so writable mappings are refused. */
	pmap_ppl_lockdown_page(va, PVH_FLAG_LOCKDOWN_RO, true);
#else /* XNU_MONITOR */
	/* No PPL: downgrade every existing mapping of the physical page to read-only. */
	pmap_page_protect(atop_kernel(kvtophys(va)), VM_PROT_READ);
#endif /* XNU_MONITOR */
}
12355
12356 #define PMAP_RESIDENT_INVALID ((mach_vm_size_t)-1)
12357
/*
 * Count resident and compressed bytes in [start, end), which must be
 * page-aligned and must not span more than one twig (L2) entry. Returns the
 * resident byte count, accumulates the compressed byte count into
 * *compressed_bytes_p (if non-NULL), or returns PMAP_RESIDENT_INVALID when
 * the pmap is NULL or no page table covers `start`.
 */
MARK_AS_PMAP_TEXT mach_vm_size_t
pmap_query_resident_internal(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t resident_bytes = 0;
	mach_vm_size_t compressed_bytes = 0;

	pt_entry_t *bpte, *epte;
	pt_entry_t *pte_p;
	tt_entry_t *tte_p;

	if (pmap == NULL) {
		return PMAP_RESIDENT_INVALID;
	}

	validate_pmap(pmap);

	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	/* Ensure that this request is valid, and addresses exactly one TTE. */
	if (__improbable((start % pt_attr_page_size(pt_attr)) ||
	    (end % pt_attr_page_size(pt_attr)))) {
		panic("%s: address range %p, %p not page-aligned to 0x%llx", __func__, (void*)start, (void*)end, pt_attr_page_size(pt_attr));
	}

	if (__improbable((end < start) || (end > ((start + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr))))) {
		panic("%s: invalid address range %p, %p", __func__, (void*)start, (void*)end);
	}

	pmap_lock(pmap, PMAP_LOCK_SHARED);
	tte_p = pmap_tte(pmap, start);
	if (tte_p == (tt_entry_t *) NULL) {
		pmap_unlock(pmap, PMAP_LOCK_SHARED);
		return PMAP_RESIDENT_INVALID;
	}
	if ((*tte_p & ARM_TTE_TYPE_MASK) == ARM_TTE_TYPE_TABLE) {
		/* Walk the leaf PTEs covering [start, end) and classify each page. */
		pte_p = (pt_entry_t *) ttetokv(*tte_p);
		bpte = &pte_p[pte_index(pt_attr, start)];
		epte = &pte_p[pte_index(pt_attr, end)];

		for (; bpte < epte; bpte++) {
			if (ARM_PTE_IS_COMPRESSED(*bpte, bpte)) {
				compressed_bytes += pt_attr_page_size(pt_attr);
			} else if (pa_valid(pte_to_pa(*bpte))) {
				resident_bytes += pt_attr_page_size(pt_attr);
			}
		}
	}
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	if (compressed_bytes_p) {
		/* Pin the caller's kernel page so this (possibly PPL) context may write it. */
		pmap_pin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
		*compressed_bytes_p += compressed_bytes;
		pmap_unpin_kernel_pages((vm_offset_t)compressed_bytes_p, sizeof(*compressed_bytes_p));
	}

	return resident_bytes;
}
12419
/*
 * Public entry: sum resident (returned) and compressed (*compressed_bytes_p)
 * bytes over [start, end) by iterating one twig-aligned chunk at a time, since
 * the internal routine handles at most one TTE per call.
 */
mach_vm_size_t
pmap_query_resident(
	pmap_t pmap,
	vm_map_address_t start,
	vm_map_address_t end,
	mach_vm_size_t *compressed_bytes_p)
{
	mach_vm_size_t total_resident_bytes;
	mach_vm_size_t compressed_bytes;
	vm_map_address_t va;


	if (pmap == PMAP_NULL) {
		if (compressed_bytes_p) {
			*compressed_bytes_p = 0;
		}
		return 0;
	}

	/* NOTE(review): pt_attr is marked __unused but is used below for the
	 * twig-size arithmetic; the attribute appears stale (harmless). */
	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);

	total_resident_bytes = 0;
	compressed_bytes = 0;

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_START,
	    VM_KERNEL_ADDRHIDE(pmap), VM_KERNEL_ADDRHIDE(start),
	    VM_KERNEL_ADDRHIDE(end));

	va = start;
	while (va < end) {
		vm_map_address_t l;
		mach_vm_size_t resident_bytes;

		/* Advance to the next twig boundary (clamped to end). */
		l = ((va + pt_attr_twig_size(pt_attr)) & ~pt_attr_twig_offmask(pt_attr));

		if (l > end) {
			l = end;
		}
#if XNU_MONITOR
		resident_bytes = pmap_query_resident_ppl(pmap, va, l, compressed_bytes_p);
#else
		resident_bytes = pmap_query_resident_internal(pmap, va, l, compressed_bytes_p);
#endif
		if (resident_bytes == PMAP_RESIDENT_INVALID) {
			break;
		}

		total_resident_bytes += resident_bytes;

		va = l;
	}

	if (compressed_bytes_p) {
		*compressed_bytes_p = compressed_bytes;
	}

	PMAP_TRACE(3, PMAP_CODE(PMAP__QUERY_RESIDENT) | DBG_FUNC_END,
	    total_resident_bytes);

	return total_resident_bytes;
}
12481
12482 #if MACH_ASSERT
/*
 * Debug-only (MACH_ASSERT) check that the pmap's ledgers are balanced when
 * the pmap is still fully associated with a live task.
 */
static void
pmap_check_ledgers(
	pmap_t pmap)
{
	int pid;
	char *procname;

	if (pmap->pmap_pid == 0 || pmap->pmap_pid == -1) {
		/*
		 * This pmap was not or is no longer fully associated
		 * with a task (e.g. the old pmap after a fork()/exec() or
		 * spawn()). Its "ledger" still points at a task that is
		 * now using a different (and active) address space, so
		 * we can't check that all the pmap ledgers are balanced here.
		 *
		 * If the "pid" is set, that means that we went through
		 * pmap_set_process() in task_terminate_internal(), so
		 * this task's ledger should not have been re-used and
		 * all the pmap ledgers should be back to 0.
		 */
		return;
	}

	pid = pmap->pmap_pid;
	procname = pmap->pmap_procname;

	/* Delegate the actual per-entry balance checks to the VM layer. */
	vm_map_pmap_check_ledgers(pmap, pmap->ledger, pid, procname);
}
12511 #endif /* MACH_ASSERT */
12512
/* Intentional no-op on this architecture. */
void
pmap_advise_pagezero_range(__unused pmap_t p, __unused uint64_t a)
{
}
12517
12518 /**
12519 * The minimum shared region nesting size is used by the VM to determine when to
12520 * break up large mappings to nested regions. The smallest size that these
12521 * mappings can be broken into is determined by what page table level those
12522 * regions are being nested in at and the size of the page tables.
12523 *
12524 * For instance, if a nested region is nesting at L2 for a process utilizing
12525 * 16KB page tables, then the minimum nesting size would be 32MB (size of an L2
12526 * block entry).
12527 *
12528 * @param pmap The target pmap to determine the block size based on whether it's
12529 * using 16KB or 4KB page tables.
12530 */
12531 uint64_t
12532 pmap_shared_region_size_min(__unused pmap_t pmap)
12533 {
12534 const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
12535
12536 /**
12537 * We always nest the shared region at L2 (32MB for 16KB pages, 2MB for
12538 * 4KB pages). This means that a target pmap will contain L2 entries that
12539 * point to shared L3 page tables in the shared region pmap.
12540 */
12541 return pt_attr_twig_size(pt_attr);
12542 }
12543
12544 boolean_t
12545 pmap_enforces_execute_only(
12546 pmap_t pmap)
12547 {
12548 return pmap != kernel_pmap;
12549 }
12550
/* Record the VM map's code-signing enforcement flag on the (validated) pmap. */
MARK_AS_PMAP_TEXT void
pmap_set_vm_map_cs_enforced_internal(
	pmap_t pmap,
	bool new_value)
{
	validate_pmap_mutable(pmap);
	pmap->pmap_vm_map_cs_enforced = new_value;
}
12559
void
pmap_set_vm_map_cs_enforced(
	pmap_t pmap,
	bool new_value)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	pmap_set_vm_map_cs_enforced_ppl(pmap, new_value);
#else
	/* Non-monitor builds: invoke the implementation directly. */
	pmap_set_vm_map_cs_enforced_internal(pmap, new_value);
#endif
}
12571
12572 extern int cs_process_enforcement_enable;
12573 bool
12574 pmap_get_vm_map_cs_enforced(
12575 pmap_t pmap)
12576 {
12577 if (cs_process_enforcement_enable) {
12578 return true;
12579 }
12580 return pmap->pmap_vm_map_cs_enforced;
12581 }
12582
/* Intentional no-op: JIT entitlement state is not tracked on this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_jit_entitled_internal(
	__unused pmap_t pmap)
{
	return;
}
12589
void
pmap_set_jit_entitled(
	pmap_t pmap)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	pmap_set_jit_entitled_ppl(pmap);
#else
	/* Non-monitor builds: invoke the implementation directly. */
	pmap_set_jit_entitled_internal(pmap);
#endif
}
12600
/* Always false here: see pmap_set_jit_entitled_internal, which records nothing. */
bool
pmap_get_jit_entitled(
	__unused pmap_t pmap)
{
	return false;
}
12607
/* Intentional no-op: TPRO state is not tracked on this configuration. */
MARK_AS_PMAP_TEXT void
pmap_set_tpro_internal(
	__unused pmap_t pmap)
{
	return;
}
12614
void
pmap_set_tpro(
	pmap_t pmap)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	pmap_set_tpro_ppl(pmap);
#else /* XNU_MONITOR */
	/* Non-monitor builds: invoke the implementation directly. */
	pmap_set_tpro_internal(pmap);
#endif /* XNU_MONITOR */
}
12625
/* Always false here: see pmap_set_tpro_internal, which records nothing. */
bool
pmap_get_tpro(
	__unused pmap_t pmap)
{
	return false;
}
12632
12633 uint64_t pmap_query_page_info_retries MARK_AS_PMAP_DATA;
12634
/*
 * Report the disposition (PMAP_QUERY_PAGE_* flags) of the page mapped at va
 * in the given user pmap, writing the result through *disp_p. Returns
 * KERN_INVALID_ARGUMENT for a NULL or kernel pmap.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_page_info_internal(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
	pmap_paddr_t pa;
	int disp;
	unsigned int pai;
	pt_entry_t *pte_p, pte;
	pv_entry_t **pv_h, *pve_p;

	if (pmap == PMAP_NULL || pmap == kernel_pmap) {
		/* Pin the caller's kernel page so this (possibly PPL) context may write it. */
		pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		*disp_p = 0;
		pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
		return KERN_INVALID_ARGUMENT;
	}

	validate_pmap(pmap);
	pmap_lock(pmap, PMAP_LOCK_SHARED);

try_again:
	disp = 0;
	pte_p = pmap_pte(pmap, va);
	if (pte_p == PT_ENTRY_NULL) {
		goto done;
	}
	/* Volatile read: the PTE may be concurrently updated under the shared lock. */
	pte = *(volatile pt_entry_t*)pte_p;
	pa = pte_to_pa(pte);
	if (pa == 0) {
		if (ARM_PTE_IS_COMPRESSED(pte, pte_p)) {
			disp |= PMAP_QUERY_PAGE_COMPRESSED;
			if (pte & ARM_PTE_COMPRESSED_ALT) {
				disp |= PMAP_QUERY_PAGE_COMPRESSED_ALTACCT;
			}
		}
	} else {
		disp |= PMAP_QUERY_PAGE_PRESENT;
		pai = pa_index(pa);
		if (!pa_valid(pa)) {
			goto done;
		}
		pvh_lock(pai);
		/* Re-read under the pv-head lock to detect a racing PTE update. */
		if (pte != *(volatile pt_entry_t*)pte_p) {
			/* something changed: try again */
			pvh_unlock(pai);
			pmap_query_page_info_retries++;
			goto try_again;
		}
		/* Find the pv_entry for this mapping (if the page has a PVE list). */
		pv_h = pai_to_pvh(pai);
		pve_p = PV_ENTRY_NULL;
		int pve_ptep_idx = 0;
		if (pvh_test_type(pv_h, PVH_TYPE_PVEP)) {
			pve_p = pvh_pve_list(pv_h);
			while (pve_p != PV_ENTRY_NULL &&
			    (pve_ptep_idx = pve_find_ptep_index(pve_p, pte_p)) == -1) {
				pve_p = pve_next(pve_p);
			}
		}

		/* Classify accounting: alt-acct takes priority over reusable over internal. */
		if (ppattr_pve_is_altacct(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_ALTACCT;
		} else if (ppattr_test_reusable(pai)) {
			disp |= PMAP_QUERY_PAGE_REUSABLE;
		} else if (ppattr_pve_is_internal(pai, pve_p, pve_ptep_idx)) {
			disp |= PMAP_QUERY_PAGE_INTERNAL;
		}
		pvh_unlock(pai);
	}

done:
	pmap_unlock(pmap, PMAP_LOCK_SHARED);
	pmap_pin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	*disp_p = disp;
	pmap_unpin_kernel_pages((vm_offset_t)disp_p, sizeof(*disp_p));
	return KERN_SUCCESS;
}
12713
kern_return_t
pmap_query_page_info(
	pmap_t pmap,
	vm_map_offset_t va,
	int *disp_p)
{
#if XNU_MONITOR
	/* Monitor builds: enter through the PPL trampoline. */
	return pmap_query_page_info_ppl(pmap, va, disp_p);
#else
	/* Non-monitor builds: invoke the implementation directly. */
	return pmap_query_page_info_internal(pmap, va, disp_p);
#endif
}
12726
12727
12728
/* Number of valid user virtual-address bits for this pmap. */
uint32_t
pmap_user_va_bits(pmap_t pmap __unused)
{
#if __ARM_MIXED_PAGE_SIZE__
	/* Mixed page sizes: derive the width from the pmap's TCR (width = 64 - T0SZ). */
	uint64_t tcr_value = pmap_get_pt_attr(pmap)->pta_tcr_value;
	return 64 - ((tcr_value >> TCR_T0SZ_SHIFT) & TCR_TSZ_MASK);
#else
	/* Single page size: T0SZ is fixed at boot. */
	return 64 - T0SZ_BOOT;
#endif
}
12739
/* Number of valid kernel virtual-address bits (width = 64 - T1SZ, fixed at boot). */
uint32_t
pmap_kernel_va_bits(void)
{
	return 64 - T1SZ_BOOT;
}
12745
/* Size in bytes of the user virtual address space for this pmap. */
static vm_map_size_t
pmap_user_va_size(pmap_t pmap)
{
	return 1ULL << pmap_user_va_bits(pmap);
}
12751
12752
12753
/* This build has no PPL, so execution is never inside it. */
bool
pmap_in_ppl(void)
{
	// Unsupported
	return false;
}
12760
/* Protected I/O-filter writes require PPL support; always fatal here. */
__attribute__((__noreturn__))
void
pmap_iofilter_protected_write(__unused vm_address_t addr, __unused uint64_t value, __unused uint64_t width)
{
	panic("%s called on an unsupported platform.", __FUNCTION__);
}
12767
/* No PPL page pool on this configuration; nothing to claim. */
void *
pmap_claim_reserved_ppl_page(void)
{
	// Unsupported
	return NULL;
}
12774
/* No PPL page pool on this configuration; nothing to free. */
void
pmap_free_reserved_ppl_page(void __unused *kva)
{
	// Unsupported
}
12780
12781
12782 #if PMAP_CS_PPL_MONITOR
12783
12784 /* Immutable part of the trust cache runtime */
12785 SECURITY_READ_ONLY_LATE(TrustCacheRuntime_t) ppl_trust_cache_rt;
12786
12787 /* Mutable part of the trust cache runtime */
12788 MARK_AS_PMAP_DATA TrustCacheMutableRuntime_t ppl_trust_cache_mut_rt;
12789
12790 /* Lock for the trust cache runtime */
12791 MARK_AS_PMAP_DATA decl_lck_rw_data(, ppl_trust_cache_rt_lock);
12792
/*
 * Check whether a trust cache with the given UUID is loaded into the PPL
 * trust-cache runtime. Returns KERN_SUCCESS when found, KERN_NOT_FOUND when
 * absent, KERN_NOT_SUPPORTED when the AMFI interface is too old, and
 * KERN_FAILURE on any other library error.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_check_trust_cache_runtime_for_uuid_internal(
	const uint8_t check_uuid[kUUIDSize])
{
	kern_return_t ret = KERN_DENIED;

	if (amfi->TrustCache.version < 3) {
		/* AMFI change hasn't landed in the build */
		pmap_cs_log_error("unable to check for loaded trust cache: interface not supported");
		return KERN_NOT_SUPPORTED;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.checkRuntimeForUUID(
		&ppl_trust_cache_rt,
		check_uuid,
		NULL);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto kern_return_t codes. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache UUID check failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
12828
/* Kernel-side entry point: trampoline into the PPL variant. */
kern_return_t
pmap_check_trust_cache_runtime_for_uuid(
	const uint8_t check_uuid[kUUIDSize])
{
	return pmap_check_trust_cache_runtime_for_uuid_ppl(check_uuid);
}
12835
/*
 * Validate, lock down, and load an image4-wrapped trust cache into the PPL
 * runtime. On success the payload pages remain owned by the monitor; on
 * failure they are returned to the kernel. The manifest is always returned
 * to the kernel once evaluation completes.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_load_trust_cache_with_type_internal(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;
	pmap_img4_payload_t *payload = NULL;
	size_t img4_payload_len = 0;
	size_t payload_len_aligned = 0;
	size_t manifest_len_aligned = 0;

	/* Ignore the auxiliary manifest until we add support for it */
	(void)img4_aux_manifest;
	(void)img4_aux_manifest_len;


#if PMAP_CS_INCLUDE_CODE_SIGNING
	if (pmap_cs) {
		/* Reject types that may only be loaded through other paths. */
		if ((type == kTCTypeStatic) || (type == kTCTypeEngineering) || (type == kTCTypeLegacy)) {
			panic("trust cache type not loadable from interface: %u", type);
		} else if (type >= kTCTypeTotal) {
			panic("attempted to load an unsupported trust cache type: %u", type);
		}

		/* Validate entitlement for the calling process */
		if (TCTypeConfig[type].entitlementValue != NULL) {
			const bool entitlement_satisfied = check_entitlement_pmap(
				NULL,
				"com.apple.private.pmap.load-trust-cache",
				TCTypeConfig[type].entitlementValue,
				false,
				true);

			if (entitlement_satisfied == false) {
				panic("attempted to load trust cache without entitlement: %u", type);
			}
		}
	}
#endif

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to load trust cache (type: %u): unable to reserve page", type);
		}
		return ret;
	}

	/* Align the passed in lengths to the page size -- round_page is overflow safe */
	payload_len_aligned = round_page(pmap_img4_payload_len);
	manifest_len_aligned = round_page(img4_manifest_len);

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(pmap_img4_payload, payload_len_aligned, false, false);
	pmap_cs_assert_addr(img4_manifest, manifest_len_aligned, false, false);

	/*
	 * Lockdown the data passed in. The pmap image4 payload also contains the trust cache
	 * data structure used by libTrustCache to manage the payload. We need to be able to
	 * write to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(pmap_img4_payload, payload_len_aligned, true);
	pmap_cs_lockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Should be safe to read from this now */
	payload = (pmap_img4_payload_t*)pmap_img4_payload;

	/* Acquire a writable version of the trust cache data structure */
	TrustCache_t *trust_cache = &payload->trust_cache;
	trust_cache = (TrustCache_t*)phystokv(kvtophys_nofail((vm_offset_t)trust_cache));

	/* Calculate the correct length of the img4 payload */
	if (os_sub_overflow(pmap_img4_payload_len, sizeof(pmap_img4_payload_t), &img4_payload_len)) {
		panic("underflow on the img4_payload_len: %lu", pmap_img4_payload_len);
	}

	/* Exclusively lock the runtime */
	lck_rw_lock_exclusive(&ppl_trust_cache_rt_lock);

	/* Load the trust cache */
	TCReturn_t tc_ret = amfi->TrustCache.load(
		&ppl_trust_cache_rt,
		type,
		trust_cache,
		(const uintptr_t)payload->img4_payload, img4_payload_len,
		(const uintptr_t)img4_manifest, img4_manifest_len);

	/* Unlock the runtime */
	lck_rw_unlock_exclusive(&ppl_trust_cache_rt_lock);

	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else {
		if (tc_ret.error == kTCReturnDuplicate) {
			ret = KERN_ALREADY_IN_SET;
		} else {
			pmap_cs_log_error("unable to load trust cache (TCReturn: 0x%02X | 0x%02X | %u)",
			    tc_ret.component, tc_ret.error, tc_ret.uniqueError);

			ret = KERN_FAILURE;
		}

		/* Unlock the payload data */
		pmap_cs_unlockdown_pages(pmap_img4_payload, payload_len_aligned, true);
		trust_cache = NULL;
		payload = NULL;
	}

	/* Unlock the manifest since it is no longer needed */
	pmap_cs_unlockdown_pages(img4_manifest, manifest_len_aligned, false);

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return ret;
}
12955
/*
 * Kernel-side entry point: call into the PPL, donating pages to its free
 * list and retrying for as long as the PPL reports a resource shortage.
 */
kern_return_t
pmap_load_trust_cache_with_type(
	TCType_t type,
	const vm_address_t pmap_img4_payload, const vm_size_t pmap_img4_payload_len,
	const vm_address_t img4_manifest, const vm_size_t img4_manifest_len,
	const vm_address_t img4_aux_manifest, const vm_size_t img4_aux_manifest_len)
{
	kern_return_t ret = KERN_DENIED;

	ret = pmap_load_trust_cache_with_type_ppl(
		type,
		pmap_img4_payload, pmap_img4_payload_len,
		img4_manifest, img4_manifest_len,
		img4_aux_manifest, img4_aux_manifest_len);

	while (ret == KERN_RESOURCE_SHORTAGE) {
		/* Allocate a page from the free list */
		pmap_alloc_page_for_ppl(0);

		/* Attempt the call again */
		ret = pmap_load_trust_cache_with_type_ppl(
			type,
			pmap_img4_payload, pmap_img4_payload_len,
			img4_manifest, img4_manifest_len,
			img4_aux_manifest, img4_aux_manifest_len);
	}

	return ret;
}
12985
/*
 * Query the PPL trust-cache runtime for a CDHash. "Safe" in that all
 * arguments are expected to already reside in PPL-owned storage.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_safe(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;

	/* Validate the query type preemptively */
	if (query_type >= kTCQueryTypeTotal) {
		pmap_cs_log_error("unable to query trust cache: invalid query type: %u", query_type);
		return KERN_INVALID_ARGUMENT;
	}

	/* Lock the runtime as shared */
	lck_rw_lock_shared(&ppl_trust_cache_rt_lock);

	TCReturn_t tc_ret = amfi->TrustCache.query(
		&ppl_trust_cache_rt,
		query_type,
		cdhash,
		query_token);

	/* Unlock the runtime */
	lck_rw_unlock_shared(&ppl_trust_cache_rt_lock);

	/* Map the libTrustCache result onto kern_return_t codes. */
	if (tc_ret.error == kTCReturnSuccess) {
		ret = KERN_SUCCESS;
	} else if (tc_ret.error == kTCReturnNotFound) {
		ret = KERN_NOT_FOUND;
	} else {
		ret = KERN_FAILURE;
		pmap_cs_log_error("trust cache query failed (TCReturn: 0x%02X | 0x%02X | %u)",
		    tc_ret.component, tc_ret.error, tc_ret.uniqueError);
	}

	return ret;
}
13024
/*
 * PPL entry for trust-cache queries: copy the caller's CDHash into PPL
 * storage (ToCToU protection), perform the query, then copy the resulting
 * token back out to the (pinned) kernel buffer.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_query_trust_cache_internal(
	TCQueryType_t query_type,
	const uint8_t cdhash[kTCEntryHashSize],
	TrustCacheQueryToken_t *query_token)
{
	kern_return_t ret = KERN_NOT_FOUND;
	TrustCacheQueryToken_t query_token_safe = {0};
	uint8_t cdhash_safe[kTCEntryHashSize] = {0};

	/* Copy in the CDHash into PPL storage */
	memcpy(cdhash_safe, cdhash, kTCEntryHashSize);

	/* Query through the safe API since we're in the PPL now */
	ret = pmap_query_trust_cache_safe(query_type, cdhash_safe, &query_token_safe);

	if (query_token != NULL) {
		/* Pin the caller's kernel page so this context may write the token back. */
		pmap_pin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
		memcpy((void*)query_token, (void*)&query_token_safe, sizeof(*query_token));
		pmap_unpin_kernel_pages((vm_offset_t)query_token, sizeof(*query_token));
	}

	return ret;
}
13049
13050 kern_return_t
13051 pmap_query_trust_cache(
13052 TCQueryType_t query_type,
13053 const uint8_t cdhash[kTCEntryHashSize],
13054 TrustCacheQueryToken_t *query_token)
13055 {
13056 kern_return_t ret = KERN_NOT_FOUND;
13057
13058 ret = pmap_query_trust_cache_ppl(
13059 query_type,
13060 cdhash,
13061 query_token);
13062
13063 return ret;
13064 }
13065
13066 MARK_AS_PMAP_DATA bool ppl_developer_mode_set = false;
13067 MARK_AS_PMAP_DATA bool ppl_developer_mode_storage = false;
13068
/*
 * Update the PPL's developer-mode state. Once the state has been set to
 * false it can never transition back to true (see table below).
 */
MARK_AS_PMAP_TEXT void
pmap_toggle_developer_mode_internal(
	bool state)
{
	bool state_set = os_atomic_load(&ppl_developer_mode_set, relaxed);

	/*
	 * Only the following state transitions are allowed:
	 * -- not set --> false
	 * -- not set --> true
	 * -- true --> false
	 * -- true --> true
	 * -- false --> false
	 *
	 * We never allow false --> true transitions.
	 */
	bool current = os_atomic_load(&ppl_developer_mode_storage, relaxed);

	if ((current == false) && (state == true) && state_set) {
		panic("PMAP_CS: attempted to enable developer mode incorrectly");
	}

	/* We're going to update the developer mode state, so update this first */
	os_atomic_store(&ppl_developer_mode_set, true, relaxed);

	/* Update the developer mode state on the system */
	os_atomic_store(&ppl_developer_mode_storage, state, relaxed);
}
13097
/* Kernel-side entry point: trampoline into the PPL variant. */
void
pmap_toggle_developer_mode(
	bool state)
{
	pmap_toggle_developer_mode_ppl(state);
}
13104
13105 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enabled = false;
13106 SECURITY_READ_ONLY_LATE(bool) ppl_lockdown_mode_enforce_jit = true;
13107
13108 #pragma mark Image4 - New
13109
/* Binds a code-signing trap selector to the handler resolved for it. */
typedef struct _pmap_image4_dispatch {
	image4_cs_trap_t selector;        /* trap being dispatched */
	image4_cs_trap_handler_t handler; /* AppleImage4 handler for the selector */
} pmap_image4_dispatch_t;
13114
/* Copy-in and dispatch for IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_set_release_type(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_release_type --> __cs_copy
	 */
	image4_cs_trap_argv_kmod_set_release_type_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13134
13135
13136
/* Copy-in and dispatch for IMAGE4_CS_TRAP_NONCE_SET. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_set(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_clear --> __cs_copy
	 * csmx_cipher --> __cs_copy
	 */
	image4_cs_trap_argv_nonce_set_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13157
/* Copy-in and dispatch for IMAGE4_CS_TRAP_NONCE_ROLL. */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_nonce_roll(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	image4_cs_trap_argv_nonce_roll_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Dispatch to AppleImage4 */
	return dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);
}
13174
/*
 * Copy-in, lock down, and dispatch for IMAGE4_CS_TRAP_IMAGE_ACTIVATE.
 * On success the payload pages stay monitor-owned; the manifest is always
 * returned to the kernel.
 */
MARK_AS_PMAP_TEXT static errno_t
_pmap_image4_monitor_trap_image_activate(
	const pmap_image4_dispatch_t *dispatch,
	const void *input_data)
{
	/*
	 * csmx_payload (csmx_payload_len) --> __cs_xfer
	 * csmx_manifest (csmx_manifest_len) --> __cs_borrow
	 */
	image4_cs_trap_argv_image_activate_t input = {0};

	/* Copy the input data to prevent ToCToU */
	memcpy(&input, input_data, sizeof(input));

	/* Validate the payload region */
	pmap_cs_assert_addr(
		input.csmx_payload, round_page(input.csmx_payload_len),
		false, false);

	/* Validate the manifest region */
	pmap_cs_assert_addr(
		input.csmx_manifest, round_page(input.csmx_manifest_len),
		false, false);

	/* Lockdown the payload region */
	pmap_cs_lockdown_pages(
		input.csmx_payload, round_page(input.csmx_payload_len), false);

	/* Lockdown the manifest region */
	pmap_cs_lockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	/* Dispatch the handler */
	errno_t err = dispatch->handler(
		dispatch->selector,
		&input, sizeof(input),
		NULL, NULL);

	/*
	 * Image activation always returns the manifest back to the kernel since it isn't
	 * needed once the evaluation of the image has been completed. The payload must
	 * remain owned by the monitor if the activation was successful.
	 */
	if (err != 0) {
		/* Unlock the payload region */
		pmap_cs_unlockdown_pages(
			input.csmx_payload, round_page(input.csmx_payload_len), false);
	}

	/* Unlock the manifest region */
	pmap_cs_unlockdown_pages(
		input.csmx_manifest, round_page(input.csmx_manifest_len), false);

	return err;
}
13230
13231 MARK_AS_PMAP_TEXT static errno_t
13232 _pmap_image4_monitor_trap_passthrough(
13233 __unused const pmap_image4_dispatch_t *dispatch,
13234 __unused const void *input_data,
13235 __unused size_t input_size)
13236 {
13237 #if DEVELOPMENT || DEBUG || KASAN
13238 return dispatch->handler(dispatch->selector, input_data, input_size, NULL, NULL);
13239 #else
13240 pmap_cs_log_error("%llu: image4 dispatch: pass-through not supported", selector);
13241 return ENOSYS;
13242 #endif
13243 }
13244
/*
 * PPL-side dispatcher for image4 code-signing traps: resolve the handler,
 * check the input vector size, reserve a spare page for CoreCrypto, route
 * the call to the per-selector helper, and release the spare page.
 */
MARK_AS_PMAP_TEXT errno_t
pmap_image4_monitor_trap_internal(
	image4_cs_trap_t selector,
	const void *input_data,
	size_t input_size)
{
	kern_return_t ret = KERN_DENIED;
	errno_t err = EPERM;

	/* Acquire the handler for this selector */
	image4_cs_trap_handler_t handler = image4_cs_trap_resolve_handler(selector);
	if (handler == NULL) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid selector", selector);
		return EINVAL;
	}

	/* Verify input size for the handler */
	if (input_size != image4_cs_trap_vector_size(selector)) {
		pmap_cs_log_error("%llu: image4 dispatch: invalid input: %lu ", selector, input_size);
		return EINVAL;
	}

	/* AppleImage4 validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret == KERN_RESOURCE_SHORTAGE) {
			/* ENOMEM tells the kernel wrapper to donate a page and retry. */
			return ENOMEM;
		}
		pmap_cs_log_error("image4 dispatch: unable to reserve page: %d", ret);
		return EPERM;
	}

	/* Setup dispatch parameters */
	pmap_image4_dispatch_t dispatch = {
		.selector = selector,
		.handler = handler
	};

	switch (selector) {
	case IMAGE4_CS_TRAP_KMOD_SET_RELEASE_TYPE:
		err = _pmap_image4_monitor_trap_set_release_type(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_SET:
		err = _pmap_image4_monitor_trap_nonce_set(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_NONCE_ROLL:
		err = _pmap_image4_monitor_trap_nonce_roll(&dispatch, input_data);
		break;

	case IMAGE4_CS_TRAP_IMAGE_ACTIVATE:
		err = _pmap_image4_monitor_trap_image_activate(&dispatch, input_data);
		break;

	default:
		err = _pmap_image4_monitor_trap_passthrough(&dispatch, input_data, input_size);
		break;
	}

	/* Return the CoreCrypto reserved page back to the free list */
	pmap_release_reserved_ppl_page();

	return err;
}
13310
13311 errno_t
13312 pmap_image4_monitor_trap(
13313 image4_cs_trap_t selector,
13314 const void *input_data,
13315 size_t input_size)
13316 {
13317 errno_t err = EPERM;
13318
13319 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13320 while (err == ENOMEM) {
13321 /* Allocate a page from the free list */
13322 pmap_alloc_page_for_ppl(0);
13323
13324 /* Call the monitor dispatch again */
13325 err = pmap_image4_monitor_trap_ppl(selector, input_data, input_size);
13326 }
13327
13328 return err;
13329 }
13330
13331 #endif /* PMAP_CS_PPL_MONITOR */
13332
13333 #if PMAP_CS_INCLUDE_CODE_SIGNING
13334
/*
 * Three-way comparator for the provisioning-profile red-black tree.
 * Profiles are keyed purely on their object addresses: returns -1, 0,
 * or 1 as profile0 is below, equal to, or above profile1.
 */
static int
pmap_cs_profiles_rbtree_compare(
	void *profile0,
	void *profile1)
{
	if (profile0 == profile1) {
		return 0;
	}
	return (profile0 < profile1) ? -1 : 1;
}
13347
/* Red-black tree for managing provisioning profiles */
MARK_AS_PMAP_DATA static
RB_HEAD(pmap_cs_profiles_rbtree, _pmap_cs_profile) pmap_cs_registered_profiles;

/* Generate the tree operations, keyed by pmap_cs_profiles_rbtree_compare (address ordering) */
RB_PROTOTYPE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);
RB_GENERATE(pmap_cs_profiles_rbtree, _pmap_cs_profile, link, pmap_cs_profiles_rbtree_compare);

/* Lock for the profile red-black tree */
MARK_AS_PMAP_DATA decl_lck_rw_data(, pmap_cs_profiles_rbtree_lock);
13357
/*
 * One-time initialization of the PPL's provisioning-profile state:
 * the registered-profiles red-black tree and its reader/writer lock.
 * The lock is marked non-sleepable since it is taken inside the PPL.
 */
void
pmap_initialize_provisioning_profiles(void)
{
	/* Initialize the profiles red-black tree lock */
	lck_rw_init(&pmap_cs_profiles_rbtree_lock, &pmap_lck_grp, 0);
	pmap_cs_profiles_rbtree_lock.lck_rw_can_sleep = FALSE;

	/* Initialize the red-black tree itself */
	RB_INIT(&pmap_cs_registered_profiles);

	printf("initialized PPL provisioning profile data\n");
}
13370
/*
 * Determine whether a profile is a TestFlight (beta) profile by querying
 * its provisioned entitlements for "beta-reports-active" == true.
 * A profile with no entitlements context is never a TestFlight profile.
 */
static bool
pmap_is_testflight_profile(
	pmap_cs_profile_t *profile_obj)
{
	const char *entitlement_name = "beta-reports-active";
	const size_t entitlement_length = strlen(entitlement_name);
	CEQueryOperation_t query[2] = {0};

	/* If the profile provisions no entitlements, then it isn't a test flight one */
	if (profile_obj->entitlements_ctx == NULL) {
		return false;
	}

	/*
	 * Build our CoreEntitlements query: select the key by name, then
	 * match its value against boolean true.
	 */
	query[0].opcode = kCEOpSelectKey;
	memcpy(query[0].parameters.stringParameter.data, entitlement_name, entitlement_length);
	query[0].parameters.stringParameter.length = entitlement_length;
	query[1] = CEMatchBool(true);

	CEError_t ce_err = amfi->CoreEntitlements.ContextQuery(
		profile_obj->entitlements_ctx,
		query, 2);

	/* kNoError means the query matched -- the entitlement is present and true */
	if (ce_err == amfi->CoreEntitlements.kNoError) {
		return true;
	}

	return false;
}
13400
13401 static bool
13402 pmap_is_development_profile(
13403 pmap_cs_profile_t *profile_obj)
13404 {
13405 /* Check for UPP */
13406 const der_vm_context_t upp_ctx = amfi->CoreEntitlements.der_vm_execute(
13407 *profile_obj->profile_ctx,
13408 CESelectDictValue("ProvisionsAllDevices"));
13409 if (amfi->CoreEntitlements.der_vm_context_is_valid(upp_ctx) == true) {
13410 if (amfi->CoreEntitlements.der_vm_bool_from_context(upp_ctx) == true) {
13411 pmap_cs_log_info("%p: [UPP] non-development profile", profile_obj);
13412 return false;
13413 }
13414 }
13415
13416 /* Check for TestFlight profile */
13417 if (pmap_is_testflight_profile(profile_obj) == true) {
13418 pmap_cs_log_info("%p: [TestFlight] non-development profile", profile_obj);
13419 return false;
13420 }
13421
13422 pmap_cs_log_info("%p: development profile", profile_obj);
13423 return true;
13424 }
13425
/*
 * Extract, validate, and cache the "Entitlements" dictionary of a
 * provisioning profile inside the profile object.
 *
 * Returns KERN_NOT_FOUND when the profile provisions no entitlements
 * (entitlements_ctx is cleared), KERN_ABORTED when CoreEntitlements
 * validation or context acquisition fails, KERN_SUCCESS otherwise.
 */
static kern_return_t
pmap_initialize_profile_entitlements(
	pmap_cs_profile_t *profile_obj)
{
	/* Locate the Entitlements dictionary within the profile's DER context */
	const der_vm_context_t entitlements_der_ctx = amfi->CoreEntitlements.der_vm_execute(
		*profile_obj->profile_ctx,
		CESelectDictValue("Entitlements"));

	if (amfi->CoreEntitlements.der_vm_context_is_valid(entitlements_der_ctx) == false) {
		/* No entitlements -- clear the cached context so queries see NULL */
		memset(&profile_obj->entitlements_ctx_storage, 0, sizeof(struct CEQueryContext));
		profile_obj->entitlements_ctx = NULL;

		pmap_cs_log_info("%p: profile provisions no entitlements", profile_obj);
		return KERN_NOT_FOUND;
	}

	/* Raw DER bounds of the entitlements dictionary */
	const uint8_t *der_start = entitlements_der_ctx.state.der_start;
	const uint8_t *der_end = entitlements_der_ctx.state.der_end;

	/* Validate the DER blob with CoreEntitlements before trusting it */
	CEValidationResult ce_result = {0};
	CEError_t ce_err = amfi->CoreEntitlements.Validate(
		pmap_cs_core_entitlements_runtime,
		&ce_result,
		der_start, der_end);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to validate profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Turn the validation result into a queryable context */
	struct CEQueryContext query_ctx = {0};
	ce_err = amfi->CoreEntitlements.AcquireUnmanagedContext(
		pmap_cs_core_entitlements_runtime,
		ce_result,
		&query_ctx);
	if (ce_err != amfi->CoreEntitlements.kNoError) {
		pmap_cs_log_error("unable to acquire context for profile entitlements: %s",
		    amfi->CoreEntitlements.GetErrorString(ce_err));

		return KERN_ABORTED;
	}

	/* Setup the entitlements context within the profile object */
	profile_obj->entitlements_ctx_storage = query_ctx;
	profile_obj->entitlements_ctx = &profile_obj->entitlements_ctx_storage;

	pmap_cs_log_info("%p: profile entitlements successfully setup", profile_obj);
	return KERN_SUCCESS;
}
13476
/*
 * PPL-side registration of a provisioning profile.
 *
 * The payload (a pmap_profile_payload_t followed by the raw profile
 * blob) is locked down from the kernel, validated through CoreTrust,
 * parsed with CoreEntitlements, and finally inserted into the
 * registered-profiles red-black tree.
 *
 * Returns KERN_RESOURCE_SHORTAGE when the caller must donate a page and
 * retry, KERN_SUCCESS on success; malformed or untrusted payloads are
 * treated as fatal (panic) since they indicate a compromised kernel.
 */
kern_return_t
pmap_register_provisioning_profile_internal(
	const vm_address_t payload_addr,
	const vm_size_t payload_size)
{
	kern_return_t ret = KERN_DENIED;
	pmap_cs_profile_t *profile_obj = NULL;
	pmap_profile_payload_t *profile_payload = NULL;
	vm_size_t max_profile_blob_size = 0;
	const uint8_t *profile_content = NULL;
	size_t profile_content_length = 0;


	/* CoreTrust validation uses CoreCrypto -- requires a spare page */
	ret = pmap_reserve_ppl_page();
	if (ret != KERN_SUCCESS) {
		if (ret != KERN_RESOURCE_SHORTAGE) {
			pmap_cs_log_error("unable to register profile: unable to reserve page: %d", ret);
		}
		return ret;
	}

	/* Ensure we have valid data passed in */
	pmap_cs_assert_addr(payload_addr, payload_size, false, false);

	/*
	 * Lockdown the data passed in. The pmap profile payload also contains the profile
	 * data structure used by the PPL to manage the payload. We need to be able to write
	 * to that data structure, so we keep the payload PPL writable.
	 */
	pmap_cs_lockdown_pages(payload_addr, payload_size, true);

	/* Should be safe to read from this now */
	profile_payload = (pmap_profile_payload_t*)payload_addr;

	/* Ensure the profile blob size provided is valid */
	if (os_sub_overflow(payload_size, sizeof(*profile_payload), &max_profile_blob_size)) {
		panic("PMAP_CS: underflow on the max_profile_blob_size: %lu", payload_size);
	} else if (profile_payload->profile_blob_size > max_profile_blob_size) {
		panic("PMAP_CS: overflow on the profile_blob_size: %lu", profile_payload->profile_blob_size);
	}

#if PMAP_CS_INCLUDE_INTERNAL_CODE
	const bool allow_development_root_cert = true;
#else
	const bool allow_development_root_cert = false;
#endif

	/* Validate the profile's signature chain and extract its content */
	int ct_result = coretrust->CTEvaluateProvisioningProfile(
		profile_payload->profile_blob, profile_payload->profile_blob_size,
		allow_development_root_cert,
		&profile_content, &profile_content_length);

	/* Release the PPL page allocated for CoreCrypto */
	pmap_release_reserved_ppl_page();

	if (ct_result != 0) {
		panic("PMAP_CS: profile does not validate through CoreTrust: %d", ct_result);
	} else if ((profile_content == NULL) || profile_content_length == 0) {
		panic("PMAP_CS: profile does not have any content: %p | %lu",
		    profile_content, profile_content_length);
	}

	/* Build a CoreEntitlements DER context over the validated content */
	der_vm_context_t profile_ctx_storage = amfi->CoreEntitlements.der_vm_context_create(
		pmap_cs_core_entitlements_runtime,
		CCDER_CONSTRUCTED_SET,
		false,
		profile_content, profile_content + profile_content_length);
	if (amfi->CoreEntitlements.der_vm_context_is_valid(profile_ctx_storage) == false) {
		panic("PMAP_CS: unable to create a CoreEntitlements context for the profile");
	}

	/*
	 * Acquire a writable version of the profile data structure. The payload
	 * mapping is now read-only to the PPL caller's view, so we go through the
	 * physical aperture to write the management fields.
	 */
	profile_obj = &profile_payload->profile_obj_storage;
	profile_obj = (pmap_cs_profile_t*)phystokv(kvtophys_nofail((vm_offset_t)profile_obj));

	profile_obj->original_payload = profile_payload;
	profile_obj->profile_ctx_storage = profile_ctx_storage;
	profile_obj->profile_ctx = &profile_obj->profile_ctx_storage;
	os_atomic_store(&profile_obj->reference_count, 0, release);

	/* Setup the entitlements provisioned by the profile */
	ret = pmap_initialize_profile_entitlements(profile_obj);
	if ((ret != KERN_SUCCESS) && (ret != KERN_NOT_FOUND)) {
		panic("PMAP_CS: fatal error while setting up profile entitlements: %d", ret);
	}

	/* Setup properties of the profile */
	profile_obj->development_profile = pmap_is_development_profile(profile_obj);

	/* Mark as validated since it passed all checks */
	profile_obj->profile_validated = true;

	/* Add the profile to the red-black tree */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);
	if (RB_INSERT(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) != NULL) {
		panic("PMAP_CS: Anomaly, profile already exists in the tree: %p", profile_obj);
	}
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	pmap_cs_log_info("%p: profile successfully registered", profile_obj);
	return KERN_SUCCESS;
}
13580
13581 kern_return_t
13582 pmap_register_provisioning_profile(
13583 const vm_address_t payload_addr,
13584 const vm_size_t payload_size)
13585 {
13586 kern_return_t ret = KERN_DENIED;
13587
13588 ret = pmap_register_provisioning_profile_ppl(
13589 payload_addr,
13590 payload_size);
13591
13592 while (ret == KERN_RESOURCE_SHORTAGE) {
13593 /* Allocate a page from the free list */
13594 pmap_alloc_page_for_ppl(0);
13595
13596 /* Attempt the call again */
13597 ret = pmap_register_provisioning_profile_ppl(
13598 payload_addr,
13599 payload_size);
13600 }
13601
13602 return ret;
13603 }
13604
/*
 * PPL-side unregistration of a provisioning profile.
 *
 * The profile must be registered and must not be referenced by any code
 * signature. On success it is removed from the tree and its backing
 * payload pages are unlocked and returned to kernel ownership.
 *
 * Returns KERN_FAILURE when the profile is still referenced,
 * KERN_SUCCESS otherwise; panics on an unknown profile pointer.
 */
kern_return_t
pmap_unregister_provisioning_profile_internal(
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Lock the red-black tree exclusively */
	lck_rw_lock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: unregistering an unknown profile: %p", profile_obj);
	}

	/* A profile still associated with a signature cannot be torn down */
	uint32_t reference_count = os_atomic_load(&profile_obj->reference_count, acquire);
	if (reference_count != 0) {
		ret = KERN_FAILURE;
		goto exit;
	}

	/* Remove the profile from the red-black tree */
	RB_REMOVE(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj);

	/* Unregistration was a success */
	ret = KERN_SUCCESS;

exit:
	/* Unlock the red-black tree */
	lck_rw_unlock_exclusive(&pmap_cs_profiles_rbtree_lock);

	if (ret == KERN_SUCCESS) {
		/* Get the original payload address */
		const pmap_profile_payload_t *profile_payload = profile_obj->original_payload;
		const vm_address_t payload_addr = (const vm_address_t)profile_payload;

		/* Get the original payload size (header + blob, rounded to page size) */
		vm_size_t payload_size = profile_payload->profile_blob_size + sizeof(*profile_payload);
		payload_size = round_page(payload_size);

		/* Unlock the profile payload -- pages return to kernel ownership */
		pmap_cs_unlockdown_pages(payload_addr, payload_size, true);
		pmap_cs_log_info("%p: profile successfully unregistered: %p | %lu", profile_obj,
		    profile_payload, payload_size);

		/* The profile memory is no longer PPL-owned; drop our pointer */
		profile_obj = NULL;
	}
	return ret;
}
13652
13653 kern_return_t
13654 pmap_unregister_provisioning_profile(
13655 pmap_cs_profile_t *profile_obj)
13656 {
13657 return pmap_unregister_provisioning_profile_ppl(profile_obj);
13658 }
13659
/*
 * PPL-side association of a registered provisioning profile with a code
 * signature (code directory).
 *
 * The signature must still be untrusted (association happens before
 * verification) and must not already carry a profile. The profile must
 * be present in the registered tree and validated; either condition
 * failing indicates a compromised kernel and panics. On success the
 * profile's reference count is incremented.
 *
 * Lock order: code-directory rwlock (exclusive), then profile tree
 * lock (shared).
 */
kern_return_t
pmap_associate_provisioning_profile_internal(
	pmap_cs_code_directory_t *cd_entry,
	pmap_cs_profile_t *profile_obj)
{
	kern_return_t ret = KERN_DENIED;

	/* Acquire the lock on the code directory */
	pmap_cs_lock_code_directory(cd_entry);

	if (cd_entry->trust != PMAP_CS_UNTRUSTED) {
		pmap_cs_log_error("disallowing profile association with verified signature");
		goto exit;
	} else if (cd_entry->profile_obj != NULL) {
		pmap_cs_log_error("disallowing multiple profile associations with signature");
		goto exit;
	}

	/* Lock the red-black tree as shared */
	lck_rw_lock_shared(&pmap_cs_profiles_rbtree_lock);

	if (RB_FIND(pmap_cs_profiles_rbtree, &pmap_cs_registered_profiles, profile_obj) == NULL) {
		panic("PMAP_CS: associating an unknown profile: %p", profile_obj);
	} else if (profile_obj->profile_validated == false) {
		panic("PMAP_CS: attempted association with unverified profile: %p", profile_obj);
	}

	/* Associate the profile with the signature */
	cd_entry->profile_obj = profile_obj;

	/* Increment the reference count on the profile object */
	uint32_t reference_count = os_atomic_add(&profile_obj->reference_count, 1, relaxed);
	if (reference_count == 0) {
		panic("PMAP_CS: overflow on reference count for profile: %p", profile_obj);
	}

	/* Unlock the red-black tree */
	lck_rw_unlock_shared(&pmap_cs_profiles_rbtree_lock);

	/* Association was a success */
	pmap_cs_log_info("associated profile %p with signature %p", profile_obj, cd_entry);
	ret = KERN_SUCCESS;

exit:
	/* pmap_cs_lock_code_directory takes the rwlock exclusively */
	lck_rw_unlock_exclusive(&cd_entry->rwlock);

	return ret;
}
13708
13709 kern_return_t
13710 pmap_associate_provisioning_profile(
13711 pmap_cs_code_directory_t *cd_entry,
13712 pmap_cs_profile_t *profile_obj)
13713 {
13714 return pmap_associate_provisioning_profile_ppl(cd_entry, profile_obj);
13715 }
13716
13717 kern_return_t
13718 pmap_disassociate_provisioning_profile_internal(
13719 pmap_cs_code_directory_t *cd_entry)
13720 {
13721 pmap_cs_profile_t *profile_obj = NULL;
13722 kern_return_t ret = KERN_DENIED;
13723
13724 /* Acquire the lock on the code directory */
13725 pmap_cs_lock_code_directory(cd_entry);
13726
13727 if (cd_entry->profile_obj == NULL) {
13728 ret = KERN_NOT_FOUND;
13729 goto exit;
13730 }
13731 profile_obj = cd_entry->profile_obj;
13732
13733 /* Disassociate the profile from the signature */
13734 cd_entry->profile_obj = NULL;
13735
13736 /* Disassociation was a success */
13737 ret = KERN_SUCCESS;
13738
13739 exit:
13740 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13741
13742 if (ret == KERN_SUCCESS) {
13743 /* Decrement the reference count on the profile object */
13744 uint32_t reference_count = os_atomic_sub(&profile_obj->reference_count, 1, release);
13745 if (reference_count == UINT32_MAX) {
13746 panic("PMAP_CS: underflow on reference count for profile: %p", profile_obj);
13747 }
13748 pmap_cs_log_info("disassociated profile %p from signature %p", profile_obj, cd_entry);
13749 }
13750 return ret;
13751 }
13752
13753 kern_return_t
13754 pmap_disassociate_provisioning_profile(
13755 pmap_cs_code_directory_t *cd_entry)
13756 {
13757 return pmap_disassociate_provisioning_profile_ppl(cd_entry);
13758 }
13759
13760 kern_return_t
13761 pmap_associate_kernel_entitlements_internal(
13762 pmap_cs_code_directory_t *cd_entry,
13763 const void *kernel_entitlements)
13764 {
13765 kern_return_t ret = KERN_DENIED;
13766
13767 if (kernel_entitlements == NULL) {
13768 panic("PMAP_CS: attempted to associate NULL kernel entitlements: %p", cd_entry);
13769 }
13770
13771 /* Acquire the lock on the code directory */
13772 pmap_cs_lock_code_directory(cd_entry);
13773
13774 if (cd_entry->trust == PMAP_CS_UNTRUSTED) {
13775 ret = KERN_DENIED;
13776 goto out;
13777 } else if (cd_entry->kernel_entitlements != NULL) {
13778 ret = KERN_DENIED;
13779 goto out;
13780 }
13781 cd_entry->kernel_entitlements = kernel_entitlements;
13782
13783 /* Association was a success */
13784 ret = KERN_SUCCESS;
13785
13786 out:
13787 lck_rw_unlock_exclusive(&cd_entry->rwlock);
13788 return ret;
13789 }
13790
13791 kern_return_t
13792 pmap_associate_kernel_entitlements(
13793 pmap_cs_code_directory_t *cd_entry,
13794 const void *kernel_entitlements)
13795 {
13796 return pmap_associate_kernel_entitlements_ppl(cd_entry, kernel_entitlements);
13797 }
13798
/*
 * PPL-side lookup of the kernel entitlements object associated with a
 * pmap's main code-signing region.
 *
 * Returns KERN_NOT_FOUND for the kernel pmap, a pmap without a signed
 * main region, or a signature with no kernel entitlements attached;
 * KERN_ABORTED when the pmap lock could not be taken without blocking
 * (the kernel-side wrapper retries on this); KERN_SUCCESS otherwise.
 * The resolved pointer is written out through kernel_entitlements,
 * which is pinned while the PPL writes through it.
 */
kern_return_t
pmap_resolve_kernel_entitlements_internal(
	pmap_t pmap,
	const void **kernel_entitlements)
{
	const void *entitlements = NULL;
	pmap_cs_code_directory_t *cd_entry = NULL;
	kern_return_t ret = KERN_DENIED;

	/* Validate the PMAP object */
	validate_pmap(pmap);

	/* Ensure no kernel PMAP */
	if (pmap == kernel_pmap) {
		return KERN_NOT_FOUND;
	}

	/* Attempt a shared lock on the PMAP -- bail out rather than block */
	if (pmap_lock_preempt(pmap, PMAP_LOCK_SHARED) != true) {
		return KERN_ABORTED;
	}

	/*
	 * Acquire the code signature from the PMAP. This function is called when
	 * performing an entitlement check, and since we've confirmed this isn't
	 * the kernel_pmap, at this stage, each pmap _should_ have a main region
	 * with a code signature.
	 */
	cd_entry = pmap_cs_code_directory_from_region(pmap->pmap_cs_main);
	if (cd_entry == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	entitlements = cd_entry->kernel_entitlements;
	if (entitlements == NULL) {
		ret = KERN_NOT_FOUND;
		goto out;
	}

	/* Pin and write out the entitlements object pointer */
	if (kernel_entitlements != NULL) {
		pmap_pin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
		*kernel_entitlements = entitlements;
		pmap_unpin_kernel_pages((vm_offset_t)kernel_entitlements, sizeof(*kernel_entitlements));
	}

	/* Successfully resolved the entitlements */
	ret = KERN_SUCCESS;

out:
	/* Unlock the code signature object */
	if (cd_entry != NULL) {
		lck_rw_unlock_shared(&cd_entry->rwlock);
		cd_entry = NULL;
	}

	/* Unlock the PMAP object */
	pmap_unlock(pmap, PMAP_LOCK_SHARED);

	return ret;
}
13861
13862 kern_return_t
13863 pmap_resolve_kernel_entitlements(
13864 pmap_t pmap,
13865 const void **kernel_entitlements)
13866 {
13867 kern_return_t ret = KERN_DENIED;
13868
13869 do {
13870 ret = pmap_resolve_kernel_entitlements_ppl(pmap, kernel_entitlements);
13871 } while (ret == KERN_ABORTED);
13872
13873 return ret;
13874 }
13875
13876 kern_return_t
13877 pmap_accelerate_entitlements_internal(
13878 pmap_cs_code_directory_t *cd_entry)
13879 {
13880 const coreentitlements_t *CoreEntitlements = NULL;
13881 const CS_SuperBlob *superblob = NULL;
13882 pmap_cs_ce_acceleration_buffer_t *acceleration_buf = NULL;
13883 size_t signature_length = 0;
13884 size_t acceleration_length = 0;
13885 size_t required_length = 0;
13886 kern_return_t ret = KERN_DENIED;
13887
13888 /* Setup the CoreEntitlements interface */
13889 CoreEntitlements = &amfi->CoreEntitlements;
13890
13891 CEError_t ce_err = CoreEntitlements->kMalformedEntitlements;
13892
13893 /* Acquire the lock on the code directory */
13894 pmap_cs_lock_code_directory(cd_entry);
13895
13896 /*
13897 * Only reconstituted code signatures can be accelerated. This is only a policy
13898 * decision we make since this allows us to re-use any unused space within the
13899 * locked down code signature region. There is also a decent bit of validation
13900 * within the reconstitution function to ensure blobs are ordered and do not
13901 * contain any padding around them which can cause issues here.
13902 *
13903 * This also serves as a check to ensure the signature is trusted.
13904 */
13905 if (cd_entry->unneeded_code_signature_unlocked == false) {
13906 ret = KERN_DENIED;
13907 goto out;
13908 }
13909
13910 if (cd_entry->ce_ctx == NULL) {
13911 ret = KERN_SUCCESS;
13912 goto out;
13913 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) == true) {
13914 ret = KERN_SUCCESS;
13915 goto out;
13916 }
13917
13918 /* We only support accelerating when size <= PAGE_SIZE */
13919 ce_err = CoreEntitlements->IndexSizeForContext(cd_entry->ce_ctx, &acceleration_length);
13920 if (ce_err != CoreEntitlements->kNoError) {
13921 if (ce_err == CoreEntitlements->kNotEligibleForAcceleration) {
13922 /* Small entitlement blobs aren't eligible */
13923 ret = KERN_SUCCESS;
13924 goto out;
13925 }
13926 panic("PMAP_CS: unable to gauge index size for entitlements acceleration: %p | %s",
13927 cd_entry, CoreEntitlements->GetErrorString(ce_err));
13928 } else if (acceleration_length > PAGE_SIZE) {
13929 ret = KERN_ABORTED;
13930 goto out;
13931 }
13932 assert(acceleration_length > 0);
13933
13934 superblob = cd_entry->superblob;
13935 signature_length = ntohl(superblob->length);
13936
13937 /* Adjust the required length for the overhead structure -- can't overflow */
13938 required_length = acceleration_length + sizeof(pmap_cs_ce_acceleration_buffer_t);
13939 if (required_length > PAGE_SIZE) {
13940 ret = KERN_ABORTED;
13941 goto out;
13942 }
13943
13944 /*
13945 * First we'll check if the code signature has enough space within the locked down
13946 * region of memory to hold the buffer. If not, then we'll see if we can bucket
13947 * allocate the buffer, and if not, we'll just allocate an entire page from the
13948 * free list.
13949 *
13950 * When we're storing the buffer within the code signature, we also need to make
13951 * sure we account for alignment of the buffer.
13952 */
13953 const vm_address_t align_mask = sizeof(void*) - 1;
13954 size_t required_length_within_sig = required_length + align_mask;
13955
13956 if ((cd_entry->superblob_size - signature_length) >= required_length_within_sig) {
13957 vm_address_t aligned_buf = (vm_address_t)cd_entry->superblob + signature_length;
13958 aligned_buf = (aligned_buf + align_mask) & ~align_mask;
13959
13960 /* We need to resolve to the physical aperture */
13961 pmap_paddr_t phys_addr = kvtophys(aligned_buf);
13962 acceleration_buf = (void*)phystokv(phys_addr);
13963
13964 /* Ensure the offset within the page wasn't lost */
13965 assert((aligned_buf & PAGE_MASK) == ((vm_address_t)acceleration_buf & PAGE_MASK));
13966
13967 acceleration_buf->allocated = false;
13968 pmap_cs_log_debug("[alloc] acceleration buffer thru signature: %p", acceleration_buf);
13969 } else {
13970 if (required_length <= pmap_cs_blob_limit) {
13971 struct pmap_cs_blob *bucket = NULL;
13972 size_t bucket_size = 0;
13973
13974 /* Allocate a buffer from the blob allocator */
13975 ret = pmap_cs_blob_alloc(&bucket, required_length, &bucket_size);
13976 if (ret != KERN_SUCCESS) {
13977 goto out;
13978 }
13979 acceleration_buf = (void*)bucket->blob;
13980 pmap_cs_log_debug("[alloc] acceleration buffer thru bucket: %p", acceleration_buf);
13981 } else {
13982 pmap_paddr_t phys_addr = 0;
13983 ret = pmap_pages_alloc_zeroed(&phys_addr, PAGE_SIZE, PMAP_PAGES_ALLOCATE_NOWAIT);
13984 if (ret != KERN_SUCCESS) {
13985 goto out;
13986 }
13987 acceleration_buf = (void*)phystokv(phys_addr);
13988 pmap_cs_log_debug("[alloc] acceleration buffer thru page: %p", acceleration_buf);
13989 }
13990 acceleration_buf->allocated = true;
13991 }
13992 acceleration_buf->magic = PMAP_CS_ACCELERATION_BUFFER_MAGIC;
13993 acceleration_buf->length = acceleration_length;
13994
13995 /* Take the acceleration buffer lock */
13996 pmap_simple_lock(&pmap_cs_acceleration_buf_lock);
13997
13998 /* Setup the global acceleration buffer state */
13999 pmap_cs_acceleration_buf = acceleration_buf;
14000
14001 /* Accelerate the entitlements */
14002 ce_err = CoreEntitlements->BuildIndexForContext(cd_entry->ce_ctx);
14003 if (ce_err != CoreEntitlements->kNoError) {
14004 panic("PMAP_CS: unable to accelerate entitlements: %p | %s",
14005 cd_entry, CoreEntitlements->GetErrorString(ce_err));
14006 } else if (CoreEntitlements->ContextIsAccelerated(cd_entry->ce_ctx) != true) {
14007 panic("PMAP_CS: entitlements not marked as accelerated: %p", cd_entry);
14008 }
14009
14010 /*
14011 * The global acceleration buffer lock is unlocked by the allocation function itself
14012 * (pmap_cs_alloc_index) so we don't need to unlock it here. Moreover, we cannot add
14013 * an assert that the lock is unlocked here since another thread could have acquired
14014 * it by now.
14015 */
14016 ret = KERN_SUCCESS;
14017
14018 out:
14019 lck_rw_unlock_exclusive(&cd_entry->rwlock);
14020 return ret;
14021 }
14022
14023 kern_return_t
14024 pmap_accelerate_entitlements(
14025 pmap_cs_code_directory_t *cd_entry)
14026 {
14027 kern_return_t ret = KERN_DENIED;
14028
14029 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14030 while (ret == KERN_RESOURCE_SHORTAGE) {
14031 /* Allocate a page for the PPL */
14032 pmap_alloc_page_for_ppl(0);
14033
14034 /* Try again */
14035 ret = pmap_accelerate_entitlements_ppl(cd_entry);
14036 }
14037
14038 return ret;
14039 }
14040
14041 #endif /* PMAP_CS_INCLUDE_CODE_SIGNING */
14042
14043 MARK_AS_PMAP_TEXT bool
14044 pmap_lookup_in_loaded_trust_caches_internal(
14045 const uint8_t cdhash[CS_CDHASH_LEN])
14046 {
14047 kern_return_t kr = KERN_NOT_FOUND;
14048
14049 #if PMAP_CS_PPL_MONITOR
14050 /*
14051 * If we have the PPL monitor, then this function can only be called from
14052 * within the PPL. Calling it directly would've caused a panic, so we can
14053 * assume that we're in the PPL here.
14054 */
14055 uint8_t cdhash_safe[CS_CDHASH_LEN];
14056 memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);
14057
14058 kr = pmap_query_trust_cache_safe(
14059 kTCQueryTypeLoadable,
14060 cdhash_safe,
14061 NULL);
14062 #else
14063 kr = query_trust_cache(
14064 kTCQueryTypeLoadable,
14065 cdhash,
14066 NULL);
14067 #endif
14068
14069 if (kr == KERN_SUCCESS) {
14070 return true;
14071 }
14072 return false;
14073 }
14074
/*
 * Kernel-side entry point for querying loaded trust caches; routes
 * through the PPL when the monitor is present, otherwise calls the
 * internal implementation directly.
 */
bool
pmap_lookup_in_loaded_trust_caches(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_loaded_trust_caches_ppl(cdhash);
#else
	return pmap_lookup_in_loaded_trust_caches_internal(cdhash);
#endif
}
14085
/*
 * Query the static (built-in) trust cache for the given CDHash.
 *
 * Returns 0 when the hash is not found; otherwise a packed result word
 * combining TC_LOOKUP_FOUND, the entry's hash type, and its flags in
 * their respective TC_LOOKUP_*_SHIFT fields.
 */
MARK_AS_PMAP_TEXT uint32_t
pmap_lookup_in_static_trust_cache_internal(
	const uint8_t cdhash[CS_CDHASH_LEN])
{
	TrustCacheQueryToken_t query_token = {0};
	kern_return_t kr = KERN_NOT_FOUND;
	uint64_t flags = 0;
	uint8_t hash_type = 0;

#if PMAP_CS_PPL_MONITOR
	/*
	 * If we have the PPL monitor, then this function can only be called from
	 * within the PPL. Calling it directly would've caused a panic, so we can
	 * assume that we're in the PPL here.
	 */
	uint8_t cdhash_safe[CS_CDHASH_LEN];
	memcpy(cdhash_safe, cdhash, CS_CDHASH_LEN);

	kr = pmap_query_trust_cache_safe(
		kTCQueryTypeStatic,
		cdhash_safe,
		&query_token);
#else
	kr = query_trust_cache(
		kTCQueryTypeStatic,
		cdhash,
		&query_token);
#endif

	if (kr == KERN_SUCCESS) {
		/* Extract the matching entry's flags and hash type from the token */
		amfi->TrustCache.queryGetFlags(&query_token, &flags);
		amfi->TrustCache.queryGetHashType(&query_token, &hash_type);

		return (TC_LOOKUP_FOUND << TC_LOOKUP_RESULT_SHIFT) |
		       (hash_type << TC_LOOKUP_HASH_TYPE_SHIFT) |
		       ((uint8_t)flags << TC_LOOKUP_FLAGS_SHIFT);
	}

	return 0;
}
14126
/*
 * Kernel-side entry point for querying the static trust cache; routes
 * through the PPL when the monitor is present, otherwise calls the
 * internal implementation directly.
 */
uint32_t
pmap_lookup_in_static_trust_cache(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_lookup_in_static_trust_cache_ppl(cdhash);
#else
	return pmap_lookup_in_static_trust_cache_internal(cdhash);
#endif
}
14136
14137 #if PMAP_CS_INCLUDE_CODE_SIGNING
14138
/* Lock protecting the compilation service CDHash below */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_compilation_service_cdhash_lock, 0);
/* CDHash of the currently-allowed compilation service; all-zero until set */
MARK_AS_PMAP_DATA uint8_t pmap_compilation_service_cdhash[CS_CDHASH_LEN] = { 0 };
14141
14142 MARK_AS_PMAP_TEXT void
14143 pmap_set_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14144 {
14145
14146 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14147 memcpy(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN);
14148 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14149
14150 pmap_cs_log_info("Added Compilation Service CDHash through the PPL: 0x%02X 0x%02X 0x%02X 0x%02X",
14151 cdhash[0], cdhash[1], cdhash[2], cdhash[4]);
14152 }
14153
14154 MARK_AS_PMAP_TEXT bool
14155 pmap_match_compilation_service_cdhash_internal(const uint8_t cdhash[CS_CDHASH_LEN])
14156 {
14157 bool match = false;
14158
14159 /* Lockdown mode disallows compilation service */
14160 if (ppl_lockdown_mode_enabled == true) {
14161 return false;
14162 }
14163
14164 pmap_simple_lock(&pmap_compilation_service_cdhash_lock);
14165 if (bcmp(pmap_compilation_service_cdhash, cdhash, CS_CDHASH_LEN) == 0) {
14166 match = true;
14167 }
14168 pmap_simple_unlock(&pmap_compilation_service_cdhash_lock);
14169
14170 if (match) {
14171 pmap_cs_log_info("Matched Compilation Service CDHash through the PPL");
14172 }
14173
14174 return match;
14175 }
14176
/*
 * Kernel-side entry point for setting the compilation service CDHash;
 * routes through the PPL when the monitor is present.
 */
void
pmap_set_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	pmap_set_compilation_service_cdhash_ppl(cdhash);
#else
	pmap_set_compilation_service_cdhash_internal(cdhash);
#endif
}
14186
/*
 * Kernel-side entry point for matching a CDHash against the compilation
 * service CDHash; routes through the PPL when the monitor is present.
 */
bool
pmap_match_compilation_service_cdhash(const uint8_t cdhash[CS_CDHASH_LEN])
{
#if XNU_MONITOR
	return pmap_match_compilation_service_cdhash_ppl(cdhash);
#else
	return pmap_match_compilation_service_cdhash_internal(cdhash);
#endif
}
14196
14197 /*
14198 * As part of supporting local signing on the device, we need the PMAP layer
14199 * to store the local signing key so that PMAP_CS can validate with it. We
14200 * store it at the PMAP layer such that it is accessible to both AMFI and
14201 * PMAP_CS should they need it.
14202 */
/* True once the kernel has provided the local signing public key (set at most once) */
MARK_AS_PMAP_DATA static bool pmap_local_signing_public_key_set = false;
/* PPL-protected storage for the local signing public key */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE] = { 0 };
14205
14206 MARK_AS_PMAP_TEXT void
14207 pmap_set_local_signing_public_key_internal(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
14208 {
14209 bool key_set = false;
14210
14211 /*
14212 * os_atomic_cmpxchg returns true in case the exchange was successful. For us,
14213 * a successful exchange means that the local signing public key has _not_ been
14214 * set. In case the key has been set, we panic as we would never expect the
14215 * kernel to attempt to set the key more than once.
14216 */
14217 key_set = !os_atomic_cmpxchg(&pmap_local_signing_public_key_set, false, true, relaxed);
14218
14219 if (key_set) {
14220 panic("attempted to set the local signing public key multiple times");
14221 }
14222
14223 memcpy(pmap_local_signing_public_key, public_key, PMAP_CS_LOCAL_SIGNING_KEY_SIZE);
14224 pmap_cs_log_info("set local signing public key");
14225 }
14226
/*
 * Kernel-side entry point for setting the local signing public key;
 * routes through the PPL when the monitor is present.
 */
void
pmap_set_local_signing_public_key(const uint8_t public_key[PMAP_CS_LOCAL_SIGNING_KEY_SIZE])
{
#if XNU_MONITOR
	return pmap_set_local_signing_public_key_ppl(public_key);
#else
	return pmap_set_local_signing_public_key_internal(public_key);
#endif
}
14236
14237 uint8_t*
14238 pmap_get_local_signing_public_key(void)
14239 {
14240 bool key_set = os_atomic_load(&pmap_local_signing_public_key_set, relaxed);
14241
14242 if (key_set) {
14243 return pmap_local_signing_public_key;
14244 }
14245
14246 return NULL;
14247 }
14248
14249 /*
14250 * Locally signed applications need to be explicitly authorized by an entitled application
14251 * before we allow them to run.
14252 */
/* CDHash of the locally-signed application most recently authorized to run. */
MARK_AS_PMAP_DATA static uint8_t pmap_local_signing_cdhash[CS_CDHASH_LEN] = {0};
/* Protects all reads and writes of pmap_local_signing_cdhash. */
MARK_AS_PMAP_DATA SIMPLE_LOCK_DECLARE(pmap_local_signing_cdhash_lock, 0);
14255
14256 MARK_AS_PMAP_TEXT void
14257 pmap_unrestrict_local_signing_internal(
14258 const uint8_t cdhash[CS_CDHASH_LEN])
14259 {
14260
14261 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14262 memcpy(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14263 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14264
14265 pmap_cs_log_debug("unrestricted local signing for CDHash: 0x%02X%02X%02X%02X%02X...",
14266 cdhash[0], cdhash[1], cdhash[2], cdhash[3], cdhash[4]);
14267 }
14268
14269 void
14270 pmap_unrestrict_local_signing(
14271 const uint8_t cdhash[CS_CDHASH_LEN])
14272 {
14273 #if XNU_MONITOR
14274 return pmap_unrestrict_local_signing_ppl(cdhash);
14275 #else
14276 return pmap_unrestrict_local_signing_internal(cdhash);
14277 #endif
14278 }
14279
14280 #if PMAP_CS
/*
 * Revoke the current local-signing authorization by zeroing the stored
 * CDHash under the lock.
 */
MARK_AS_PMAP_TEXT static void
pmap_restrict_local_signing(void)
{
	pmap_simple_lock(&pmap_local_signing_cdhash_lock);
	memset(pmap_local_signing_cdhash, 0, sizeof(pmap_local_signing_cdhash));
	pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
}
14288
14289 MARK_AS_PMAP_TEXT static bool
14290 pmap_local_signing_restricted(
14291 const uint8_t cdhash[CS_CDHASH_LEN])
14292 {
14293 pmap_simple_lock(&pmap_local_signing_cdhash_lock);
14294 int ret = memcmp(pmap_local_signing_cdhash, cdhash, sizeof(pmap_local_signing_cdhash));
14295 pmap_simple_unlock(&pmap_local_signing_cdhash_lock);
14296
14297 return ret != 0;
14298 }
14299
#endif /* PMAP_CS */
#endif
14302
/*
 * Suspend or resume physical-footprint accounting for the current thread
 * and the given map's pmap. Only active on DEVELOPMENT || DEBUG kernels;
 * a no-op otherwise.
 */
MARK_AS_PMAP_TEXT void
pmap_footprint_suspend_internal(
	vm_map_t map,
	boolean_t suspend)
{
#if DEVELOPMENT || DEBUG
	if (suspend) {
		current_thread()->pmap_footprint_suspended = TRUE;
		/* Note: footprint_was_suspended is only ever set here, never
		 * cleared on resume — it records that a suspension happened. */
		map->pmap->footprint_was_suspended = TRUE;
	} else {
		current_thread()->pmap_footprint_suspended = FALSE;
	}
#else /* DEVELOPMENT || DEBUG */
	(void) map;
	(void) suspend;
#endif /* DEVELOPMENT || DEBUG */
}
14320
/*
 * Kernel-facing wrapper for pmap_footprint_suspend_internal(); dispatches
 * into the PPL when XNU_MONITOR is enabled.
 */
void
pmap_footprint_suspend(
	vm_map_t map,
	boolean_t suspend)
{
#if XNU_MONITOR
	pmap_footprint_suspend_ppl(map, suspend);
#else
	pmap_footprint_suspend_internal(map, suspend);
#endif
}
14332
/*
 * No-op: validates that the pmap argument is a mutable pmap and does
 * nothing else.
 */
MARK_AS_PMAP_TEXT void
pmap_nop_internal(pmap_t pmap __unused)
{
	validate_pmap_mutable(pmap);
}
14338
/*
 * Public no-op entry point; presumably used to exercise the PPL call path
 * when XNU_MONITOR is enabled — confirm with callers.
 */
void
pmap_nop(pmap_t pmap)
{
#if XNU_MONITOR
	pmap_nop_ppl(pmap);
#else
	pmap_nop_internal(pmap);
#endif
}
14348
14349 #if defined(__arm64__) && (DEVELOPMENT || DEBUG)
14350
/*
 * Header emitted before each translation table copied into the dump buffer
 * by pmap_dump_page_tables_recurse().
 */
struct page_table_dump_header {
	uint64_t pa;          /* physical address of the dumped table */
	uint64_t num_entries; /* number of tt_entry_t entries that follow */
	uint64_t start_va;    /* first VA covered by this table */
	uint64_t end_va;      /* one past the last VA covered by this table */
};
14357
/*
 * Recursive worker for pmap_dump_page_tables(): walks the translation table
 * at ttp (covering VAs starting at start_va, at level cur_level) and copies
 * each visited table into the dump buffer, preceded by a
 * page_table_dump_header when the level is selected by level_mask.
 *
 * @param level_mask   bitmask of levels to emit; level L is dumped iff
 *                     (1 << L) is set.
 * @param buf_start    start of the destination buffer.
 * @param buf_end      one past the end of the destination buffer.
 * @param bytes_copied in/out running count of bytes written to buf_start.
 *
 * @return KERN_SUCCESS, or KERN_INSUFFICIENT_BUFFER_SIZE when the remaining
 *         space cannot hold the current table plus its header. Panics on a
 *         non-block, non-table entry at the leaf level.
 */
static kern_return_t
pmap_dump_page_tables_recurse(pmap_t pmap,
    const tt_entry_t *ttp,
    unsigned int cur_level,
    unsigned int level_mask,
    uint64_t start_va,
    void *buf_start,
    void *buf_end,
    size_t *bytes_copied)
{
	const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uint64_t num_entries = pt_attr_page_size(pt_attr) / sizeof(*ttp);

	uint64_t size = pt_attr->pta_level_info[cur_level].size;
	uint64_t valid_mask = pt_attr->pta_level_info[cur_level].valid_mask;
	uint64_t type_mask = pt_attr->pta_level_info[cur_level].type_mask;
	uint64_t type_block = pt_attr->pta_level_info[cur_level].type_block;

	void *bufp = (uint8_t*)buf_start + *bytes_copied;

	if (cur_level == pt_attr_root_level(pt_attr)) {
		/* Root level: entry count comes from the root allocation size,
		 * not the table page size. */
		start_va &= ~(pt_attr->pta_level_info[cur_level].offmask);
		num_entries = pmap_root_alloc_size(pmap) / sizeof(tt_entry_t);
	}

	uint64_t tt_size = num_entries * sizeof(tt_entry_t);
	const tt_entry_t *tt_end = &ttp[num_entries];

	/* Conservative: space is required even when level_mask filters this
	 * level out and nothing is actually written. */
	if (((vm_offset_t)buf_end - (vm_offset_t)bufp) < (tt_size + sizeof(struct page_table_dump_header))) {
		return KERN_INSUFFICIENT_BUFFER_SIZE;
	}

	if (level_mask & (1U << cur_level)) {
		struct page_table_dump_header *header = (struct page_table_dump_header*)bufp;
		header->pa = ml_static_vtop((vm_offset_t)ttp);
		header->num_entries = num_entries;
		header->start_va = start_va;
		header->end_va = start_va + (num_entries * size);

		bcopy(ttp, (uint8_t*)bufp + sizeof(*header), tt_size);
		*bytes_copied = *bytes_copied + sizeof(*header) + tt_size;
	}
	uint64_t current_va = start_va;

	for (const tt_entry_t *ttep = ttp; ttep < tt_end; ttep++, current_va += size) {
		tt_entry_t tte = *ttep;

		if (!(tte & valid_mask)) {
			continue;
		}

		if ((tte & type_mask) == type_block) {
			/* Block/page mapping: no lower-level table to recurse into. */
			continue;
		} else {
			if (cur_level >= pt_attr_leaf_level(pt_attr)) {
				panic("%s: corrupt entry %#llx at %p, "
				    "ttp=%p, cur_level=%u, bufp=%p, buf_end=%p",
				    __FUNCTION__, tte, ttep,
				    ttp, cur_level, bufp, buf_end);
			}

			/* Table entry: follow it to the next-level table. */
			const tt_entry_t *next_tt = (const tt_entry_t*)phystokv(tte & ARM_TTE_TABLE_MASK);

			kern_return_t recurse_result = pmap_dump_page_tables_recurse(pmap, next_tt, cur_level + 1,
			    level_mask, current_va, buf_start, buf_end, bytes_copied);

			if (recurse_result != KERN_SUCCESS) {
				return recurse_result;
			}
		}
	}

	return KERN_SUCCESS;
}
14432
14433 kern_return_t
14434 pmap_dump_page_tables(pmap_t pmap, void *bufp, void *buf_end, unsigned int level_mask, size_t *bytes_copied)
14435 {
14436 if (not_in_kdp) {
14437 panic("pmap_dump_page_tables must only be called from kernel debugger context");
14438 }
14439 return pmap_dump_page_tables_recurse(pmap, pmap->tte, pt_attr_root_level(pmap_get_pt_attr(pmap)),
14440 level_mask, pmap->min, bufp, buf_end, bytes_copied);
14441 }
14442
14443 #else /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14444
/*
 * Stub for configurations without page-table dumping support (non-arm64 or
 * non-DEVELOPMENT/DEBUG builds).
 */
kern_return_t
pmap_dump_page_tables(pmap_t pmap __unused, void *bufp __unused, void *buf_end __unused,
    unsigned int level_mask __unused, size_t *bytes_copied __unused)
{
	return KERN_NOT_SUPPORTED;
}
14451 #endif /* defined(__arm64__) && (DEVELOPMENT || DEBUG) */
14452
14453
14454 #ifdef CONFIG_XNUPOST
14455 #ifdef __arm64__
/* Set by pmap_test_fault_handler() when an expected test fault is taken. */
static volatile bool pmap_test_took_fault = false;
14457
14458 static bool
14459 pmap_test_fault_handler(arm_saved_state_t * state)
14460 {
14461 bool retval = false;
14462 uint32_t esr = get_saved_state_esr(state);
14463 esr_exception_class_t class = ESR_EC(esr);
14464 fault_status_t fsc = ISS_IA_FSC(ESR_ISS(esr));
14465
14466 if ((class == ESR_EC_DABORT_EL1) &&
14467 ((fsc == FSC_PERMISSION_FAULT_L3) || (fsc == FSC_ACCESS_FLAG_FAULT_L3))) {
14468 pmap_test_took_fault = true;
14469 /* return to the instruction immediately after the call to NX page */
14470 set_saved_state_pc(state, get_saved_state_pc(state) + 4);
14471 retval = true;
14472 }
14473
14474 return retval;
14475 }
14476
14477 // Disable KASAN instrumentation, as the test pmap's TTBR0 space will not be in the shadow map
/*
 * Perform a single read or write to va, optionally after switching to the
 * given pmap, and report whether the access faulted as expected.
 *
 * @param pmap         pmap to switch to for the access, or NULL to access
 *                     through the current pmap.
 * @param va           virtual address to access.
 * @param should_fault whether the access is expected to fault.
 * @param is_write     true for a write access, false for a read.
 *
 * @return true if the observed fault behavior matched should_fault.
 */
static NOKASAN bool
pmap_test_access(pmap_t pmap, vm_map_address_t va, bool should_fault, bool is_write)
{
	pmap_t old_pmap = NULL;

	pmap_test_took_fault = false;

	/*
	 * We're potentially switching pmaps without using the normal thread
	 * mechanism; disable interrupts and preemption to avoid any unexpected
	 * memory accesses.
	 */
	uint64_t old_int_state = pmap_interrupts_disable();
	mp_disable_preemption();

	if (pmap != NULL) {
		old_pmap = current_pmap();
		pmap_switch(pmap);

		/* Disable PAN; pmap shouldn't be the kernel pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 0);
#endif /* __ARM_PAN_AVAILABLE__ */
	}

	ml_expect_fault_begin(pmap_test_fault_handler, va);

	if (is_write) {
		*((volatile uint64_t*)(va)) = 0xdec0de;
	} else {
		volatile uint64_t tmp = *((volatile uint64_t*)(va));
		(void)tmp;
	}

	/* Save the fault bool, and undo the gross stuff we did. */
	bool took_fault = pmap_test_took_fault;
	ml_expect_fault_end();

	if (pmap != NULL) {
		/* Re-enable PAN before switching back to the original pmap. */
#if __ARM_PAN_AVAILABLE__
		__builtin_arm_wsr("pan", 1);
#endif /* __ARM_PAN_AVAILABLE__ */

		pmap_switch(old_pmap);
	}

	mp_enable_preemption();
	pmap_interrupts_restore(old_int_state);
	bool retval = (took_fault == should_fault);
	return retval;
}
14529
14530 static bool
14531 pmap_test_read(pmap_t pmap, vm_map_address_t va, bool should_fault)
14532 {
14533 bool retval = pmap_test_access(pmap, va, should_fault, false);
14534
14535 if (!retval) {
14536 T_FAIL("%s: %s, "
14537 "pmap=%p, va=%p, should_fault=%u",
14538 __func__, should_fault ? "did not fault" : "faulted",
14539 pmap, (void*)va, (unsigned)should_fault);
14540 }
14541
14542 return retval;
14543 }
14544
14545 static bool
14546 pmap_test_write(pmap_t pmap, vm_map_address_t va, bool should_fault)
14547 {
14548 bool retval = pmap_test_access(pmap, va, should_fault, true);
14549
14550 if (!retval) {
14551 T_FAIL("%s: %s, "
14552 "pmap=%p, va=%p, should_fault=%u",
14553 __func__, should_fault ? "did not fault" : "faulted",
14554 pmap, (void*)va, (unsigned)should_fault);
14555 }
14556
14557 return retval;
14558 }
14559
14560 static bool
14561 pmap_test_check_refmod(pmap_paddr_t pa, unsigned int should_be_set)
14562 {
14563 unsigned int should_be_clear = (~should_be_set) & (VM_MEM_REFERENCED | VM_MEM_MODIFIED);
14564 unsigned int bits = pmap_get_refmod((ppnum_t)atop(pa));
14565
14566 bool retval = (((bits & should_be_set) == should_be_set) && ((bits & should_be_clear) == 0));
14567
14568 if (!retval) {
14569 T_FAIL("%s: bits=%u, "
14570 "pa=%p, should_be_set=%u",
14571 __func__, bits,
14572 (void*)pa, should_be_set);
14573 }
14574
14575 return retval;
14576 }
14577
14578 static __attribute__((noinline)) bool
14579 pmap_test_read_write(pmap_t pmap, vm_map_address_t va, bool allow_read, bool allow_write)
14580 {
14581 bool retval = (pmap_test_read(pmap, va, !allow_read) | pmap_test_write(pmap, va, !allow_write));
14582 return retval;
14583 }
14584
/*
 * Core XNUPOST pmap test for one pmap configuration: exercises mapping
 * creation, permission enforcement, ref/mod accounting, arm_fast_fault,
 * pmap_protect/pmap_page_protect, and disconnect on a pmap created with
 * the given flags. Panics on setup failure; behavioral failures are
 * reported via T_FAIL. Always returns 0.
 */
static int
pmap_test_test_config(unsigned int flags)
{
	T_LOG("running pmap_test_test_config flags=0x%X", flags);
	unsigned int map_count = 0;
	unsigned long page_ratio = 0;
	pmap_t pmap = pmap_create_options(NULL, 0, flags);

	if (!pmap) {
		panic("Failed to allocate pmap");
	}

	__unused const pt_attr_t * const pt_attr = pmap_get_pt_attr(pmap);
	uintptr_t native_page_size = pt_attr_page_size(native_pt_attr);
	uintptr_t pmap_page_size = pt_attr_page_size(pt_attr);
	uintptr_t pmap_twig_size = pt_attr_twig_size(pt_attr);

	if (pmap_page_size <= native_page_size) {
		/* Number of pmap pages needed to cover one kernel page. */
		page_ratio = native_page_size / pmap_page_size;
	} else {
		/*
		 * We claim to support a page_ratio of less than 1, which is
		 * not currently supported by the pmap layer; panic.
		 */
		panic("%s: page_ratio < 1, native_page_size=%lu, pmap_page_size=%lu"
		    "flags=%u",
		    __func__, native_page_size, pmap_page_size,
		    flags);
	}

	if (PAGE_RATIO > 1) {
		/*
		 * The kernel is deliberately pretending to have 16KB pages.
		 * The pmap layer has code that supports this, so pretend the
		 * page size is larger than it is.
		 */
		pmap_page_size = PAGE_SIZE;
		native_page_size = PAGE_SIZE;
	}

	/*
	 * Get two pages from the VM; one to be mapped wired, and one to be
	 * mapped nonwired.
	 */
	vm_page_t unwired_vm_page = vm_page_grab();
	vm_page_t wired_vm_page = vm_page_grab();

	if ((unwired_vm_page == VM_PAGE_NULL) || (wired_vm_page == VM_PAGE_NULL)) {
		panic("Failed to grab VM pages");
	}

	ppnum_t pn = VM_PAGE_GET_PHYS_PAGE(unwired_vm_page);
	ppnum_t wired_pn = VM_PAGE_GET_PHYS_PAGE(wired_vm_page);

	pmap_paddr_t pa = ptoa(pn);
	pmap_paddr_t wired_pa = ptoa(wired_pn);

	/*
	 * We'll start mappings at the second twig TT. This keeps us from only
	 * using the first entry in each TT, which would trivially be address
	 * 0; one of the things we will need to test is retrieving the VA for
	 * a given PTE.
	 */
	vm_map_address_t va_base = pmap_twig_size;
	vm_map_address_t wired_va_base = ((2 * pmap_twig_size) - pmap_page_size);

	if (wired_va_base < (va_base + (page_ratio * pmap_page_size))) {
		/*
		 * Not exactly a functional failure, but this test relies on
		 * there being a spare PTE slot we can use to pin the TT.
		 */
		panic("Cannot pin translation table");
	}

	/*
	 * Create the wired mapping; this will prevent the pmap layer from
	 * reclaiming our test TTs, which would interfere with this test
	 * ("interfere" -> "make it panic").
	 */
	pmap_enter_addr(pmap, wired_va_base, wired_pa, VM_PROT_READ, VM_PROT_READ, 0, true);

#if XNU_MONITOR
	/*
	 * If the PPL is enabled, make sure that the kernel cannot write
	 * to PPL memory.
	 */
	if (!pmap_ppl_disable) {
		T_LOG("Validate that kernel cannot write to PPL memory.");
		pt_entry_t * ptep = pmap_pte(pmap, va_base);
		pmap_test_write(NULL, (vm_map_address_t)ptep, true);
	}
#endif

	/*
	 * Create read-only mappings of the nonwired page; if the pmap does
	 * not use the same page size as the kernel, create multiple mappings
	 * so that the kernel page is fully mapped.
	 */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pmap_enter_addr(pmap, va_base + (pmap_page_size * map_count), pa + (pmap_page_size * (map_count)), VM_PROT_READ, VM_PROT_READ, 0, false);
	}

	/* Validate that all the PTEs have the expected PA and VA. */
	for (map_count = 0; map_count < page_ratio; map_count++) {
		pt_entry_t * ptep = pmap_pte(pmap, va_base + (pmap_page_size * map_count));

		if (pte_to_pa(*ptep) != (pa + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected pa=%p, expected %p, map_count=%u",
			    (void*)pte_to_pa(*ptep), (void*)(pa + (pmap_page_size * map_count)), map_count);
		}

		if (ptep_get_va(ptep) != (va_base + (pmap_page_size * map_count))) {
			T_FAIL("Unexpected va=%p, expected %p, map_count=%u",
			    (void*)ptep_get_va(ptep), (void*)(va_base + (pmap_page_size * map_count)), map_count);
		}
	}

	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);

	T_LOG("Make the first mapping writable.");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);

	T_LOG("Validate that writes to our mapping do not fault.");
	pmap_test_write(pmap, va_base, false);


	T_LOG("Make the first mapping execute-only");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_EXECUTE, VM_PROT_EXECUTE, 0, false);


	T_LOG("Validate that reads to our mapping do not fault.");
	pmap_test_read(pmap, va_base, false);

	T_LOG("Validate that writes to our mapping fault.");
	pmap_test_write(pmap, va_base, true);


	/*
	 * For page ratios of greater than 1: validate that writes to the other
	 * mappings still fault. Remove the mappings afterwards (we're done
	 * with page ratio testing).
	 */
	for (map_count = 1; map_count < page_ratio; map_count++) {
		pmap_test_write(pmap, va_base + (pmap_page_size * map_count), true);
		pmap_remove(pmap, va_base + (pmap_page_size * map_count), va_base + (pmap_page_size * map_count) + pmap_page_size);
	}

	T_LOG("Mark the page unreferenced and unmodified.");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_check_refmod(pa, 0);

	/*
	 * Begin testing the ref/mod state machine. Re-enter the mapping with
	 * different protection/fault_type settings, and confirm that the
	 * ref/mod state matches our expectations at each step.
	 */
	T_LOG("!ref/!mod: read, no fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: read, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_NONE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("ref/!mod: rw, read fault. Expect ref/!mod");
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);

	T_LOG("!ref/!mod: rw, rw fault. Expect ref/mod");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_enter_addr(pmap, va_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);

	/*
	 * Shared memory testing; we'll have two mappings; one read-only,
	 * one read-write.
	 */
	vm_map_address_t rw_base = va_base;
	vm_map_address_t ro_base = va_base + pmap_page_size;

	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_enter_addr(pmap, ro_base, pa, VM_PROT_READ, VM_PROT_READ, 0, false);

	/*
	 * Test that we take faults as expected for unreferenced/unmodified
	 * pages. Also test the arm_fast_fault interface, to ensure that
	 * mapping permissions change as expected.
	 */
	T_LOG("!ref/!mod: expect no access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, false, false);
	pmap_test_read_write(pmap, rw_base, false, false);

	T_LOG("Read fault; expect !ref/!mod -> ref/!mod, read access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Write fault; expect ref/!mod -> ref/mod, read and write access");
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Write fault; expect !ref/!mod -> ref/mod, read and write access");
	pmap_clear_refmod(pn, VM_MEM_MODIFIED | VM_MEM_REFERENCED);
	arm_fast_fault(pmap, rw_base, VM_PROT_READ | VM_PROT_WRITE, false, false);
	pmap_test_check_refmod(pa, VM_MEM_REFERENCED | VM_MEM_MODIFIED);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("RW protect both mappings; should not change protections.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ | VM_PROT_WRITE);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect both mappings; RW mapping should become RO.");
	pmap_protect(pmap, ro_base, ro_base + pmap_page_size, VM_PROT_READ);
	pmap_protect(pmap, rw_base, rw_base + pmap_page_size, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("RW protect the page; mappings should not change protections.");
	pmap_enter_addr(pmap, rw_base, pa, VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE, 0, false);
	pmap_page_protect(pn, VM_PROT_ALL);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, true);

	T_LOG("Read protect the page; RW mapping should become RO.");
	pmap_page_protect(pn, VM_PROT_READ);
	pmap_test_read_write(pmap, ro_base, true, false);
	pmap_test_read_write(pmap, rw_base, true, false);

	T_LOG("Validate that disconnect removes all known mappings of the page.");
	pmap_disconnect(pn);
	if (!pmap_verify_free(pn)) {
		T_FAIL("Page still has mappings");
	}

	T_LOG("Remove the wired mapping, so we can tear down the test map.");
	pmap_remove(pmap, wired_va_base, wired_va_base + pmap_page_size);
	pmap_destroy(pmap);

	T_LOG("Release the pages back to the VM.");
	vm_page_lock_queues();
	vm_page_free(unwired_vm_page);
	vm_page_free(wired_vm_page);
	vm_page_unlock_queues();

	T_LOG("Testing successful!");
	return 0;
}
14849 #endif /* __arm64__ */
14850
14851 kern_return_t
14852 pmap_test(void)
14853 {
14854 T_LOG("Starting pmap_tests");
14855 #ifdef __arm64__
14856 int flags = 0;
14857 flags |= PMAP_CREATE_64BIT;
14858
14859 #if __ARM_MIXED_PAGE_SIZE__
14860 T_LOG("Testing VM_PAGE_SIZE_4KB");
14861 pmap_test_test_config(flags | PMAP_CREATE_FORCE_4K_PAGES);
14862 T_LOG("Testing VM_PAGE_SIZE_16KB");
14863 pmap_test_test_config(flags);
14864 #else /* __ARM_MIXED_PAGE_SIZE__ */
14865 pmap_test_test_config(flags);
14866 #endif /* __ARM_MIXED_PAGE_SIZE__ */
14867
14868 #endif /* __arm64__ */
14869 T_PASS("completed pmap_test successfully");
14870 return KERN_SUCCESS;
14871 }
14872 #endif /* CONFIG_XNUPOST */
14873
14874 /*
14875 * The following function should never make it to RELEASE code, since
14876 * it provides a way to get the PPL to modify text pages.
14877 */
14878 #if DEVELOPMENT || DEBUG
14879
/* A32 permanently-undefined (UDF) instruction encoding. */
#define ARM_UNDEFINED_INSN 0xe7f000f0
/* T16 (Thumb) permanently-undefined (UDF) instruction encoding. */
#define ARM_UNDEFINED_INSN_THUMB 0xde00
14882
14883 /**
14884 * Forcibly overwrite executable text with an illegal instruction.
14885 *
14886 * @note Only used for xnu unit testing.
14887 *
14888 * @param pa The physical address to corrupt.
14889 *
14890 * @return KERN_SUCCESS on success.
14891 */
kern_return_t
pmap_test_text_corruption(pmap_paddr_t pa)
{
#if XNU_MONITOR
	/* Dispatch to the PPL variant when the monitor is enabled. */
	return pmap_test_text_corruption_ppl(pa);
#else /* XNU_MONITOR */
	return pmap_test_text_corruption_internal(pa);
#endif /* XNU_MONITOR */
}
14901
/*
 * Implementation of pmap_test_text_corruption(): write an undefined
 * instruction at pa through the physical aperture, temporarily granting
 * write access when the page is marked executable.
 */
MARK_AS_PMAP_TEXT kern_return_t
pmap_test_text_corruption_internal(pmap_paddr_t pa)
{
	vm_offset_t va = phystokv(pa);
	unsigned int pai = pa_index(pa);

	assert(pa_valid(pa));

	pvh_lock(pai);

	pv_entry_t **pv_h = pai_to_pvh(pai);
	assert(!pvh_test_type(pv_h, PVH_TYPE_NULL));
#if defined(PVH_FLAG_EXEC)
	/* Executable page: make its physical-aperture mapping kernel-writable
	 * for the store below (restored to AP_RONA afterwards). */
	const bool need_ap_twiddle = pvh_get_flags(pv_h) & PVH_FLAG_EXEC;

	if (need_ap_twiddle) {
		pmap_set_ptov_ap(pai, AP_RWNA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/*
	 * The low bit in an instruction address indicates a THUMB instruction
	 */
	if (va & 1) {
		va &= ~(vm_offset_t)1;
		*(uint16_t *)va = ARM_UNDEFINED_INSN_THUMB;
	} else {
		*(uint32_t *)va = ARM_UNDEFINED_INSN;
	}

#if defined(PVH_FLAG_EXEC)
	if (need_ap_twiddle) {
		/* Restore the read-only aperture mapping. */
		pmap_set_ptov_ap(pai, AP_RONA, FALSE);
	}
#endif /* defined(PVH_FLAG_EXEC) */

	/* Make sure instruction fetches observe the new encoding. */
	InvalidatePoU_IcacheRegion(va, sizeof(uint32_t));

	pvh_unlock(pai);

	return KERN_SUCCESS;
}
14944
14945 #endif /* DEVELOPMENT || DEBUG */
14946